Load the global variables

here::i_am("scripts/2.2_multiome_mouse_mammary_gland.Rmd")
mainDir = here::here()
source(knitr::purl(file.path(mainDir,"scripts/global_variables.Rmd"), quiet=TRUE))
source(knitr::purl(file.path(mainDir, "scripts/functions.Rmd"), quiet=TRUE))

set.seed(123)

1 Preparation and object creation

1.1 Setting directories and input files

inputDir_local = file.path(inputDir, "mm10", "mouse_mammary_gland", "one_cell_multiome")

outputDir_local = file.path(outputDir, "2.2_multiome_mouse_mammary_gland") ; if(!file.exists(outputDir_local)){dir.create(outputDir_local)}
outputDir_objects = file.path(outputDir_local, "objects") ; if(!file.exists(outputDir_objects)){dir.create(outputDir_objects)}
outputDir_plots = file.path(outputDir_local, "plots") ; if(!file.exists(outputDir_plots)){dir.create(outputDir_plots)}
outputDir_tables = file.path(outputDir_local, "tables") ; if(!file.exists(outputDir_tables)){dir.create(outputDir_tables)}


ld <- list.dirs(inputDir_local)

fragpaths <- list.files(ld[grepl("/.*fragmentFiles", ld)], full.names = TRUE)
fragpaths_k4 <- fragpaths[grepl(".*/h3k4me1/.*fragmentFiles/.*tsv.gz$", fragpaths)]
fragpaths_k27 <- fragpaths[grepl(".*/h3k27me3/.*fragmentFiles/.*tsv.gz$", fragpaths)]
fragpaths <- list(fragpaths_k27, fragpaths_k4)
names(fragpaths) <- c("h3k27me3", "h3k4me1")

rnapaths <- ld[grepl("/10XlikeMatrix_umi", ld)]

rm(ld)

1.2 Loading annotation

consensus_peaks <- list("h3k27me3" = toGRanges(file.path(annotDir, "CREneg_peaks_h3k27me3.bed"), format="BED", header=FALSE),
                        "h3k4me1" = toGRanges(file.path(annotDir, "CREneg_peaks_h3k4me1.bed"), format="BED", header=FALSE))

1.3 Loading the bigwigs and tables

ld <- list.dirs(inputDir)
bw_paths <- list.files(ld[grepl(".*/mouse_mammary_gland/bigwig.*", ld)], full.names = TRUE)
names(bw_paths) <- sub(".*D1888_CRE3-Mice8724_", "", bw_paths)
  
list_rna_bw <- list(rna_all = bw_paths[["all_rna_pseudobulk.bw"]],
                    rna_luminal = bw_paths[["cluster_1_rna_luminal.bw"]],
                    rna_basal = bw_paths[["cluster_0_rna_basal.bw"]],
                    rna_luminal_sc = bw_paths[["T0302_luminal_cell.bw"]],
                    rna_basal_sc = bw_paths[["T0290_basal_cell.bw"]])
rm(ld)

1.4 Loading the FACS data

ld <- list.dirs(inputDir_local)
facs_path <- list.files(ld[grepl("/facs", ld)], full.names = TRUE)
facs_data <- read.csv(facs_path)
rm(ld)
row.names(facs_data) <- facs_data$CB
facs_data <-  facs_data[, c(1,2)]
colnames(facs_data) <- c("CD24", "CD49f") 

1.5 Creating the object

## loading RNA data
rna.data <- Read10X(data.dir = rnapaths)
rna_seurat <- CreateSeuratObject(counts = rna.data,
                             min.cells = 1,
                             min.features = -1,
                             project = "mouse_mammary_gland")

## adding FACS annotation
rna_seurat <- AddMetaData(
  object = rna_seurat,
  metadata = facs_data
)

## creating the multiome seurat objects 
seurat_list <- list()

for (i in names(fragpaths)){
    message(paste0("### Making fragment object for ", i))
    total_counts <- CountFragments(fragpaths[[i]])
    barcodes <- total_counts$CB
    frags <- CreateFragmentObject(path = fragpaths[[i]], cells = barcodes)
    
    message(paste0("### Making 50k bin matrix for ", i))
    bin50k_kmatrix = GenomeBinMatrix(
    frags,
    genome = mm10,
    cells = NULL,
    binsize = 50000,
    process_n = 10000,
    sep = c(":", "_"),
    verbose = TRUE)
    
    message(paste0("### Making peak matrix for ", i))
    peak_matrix = FeatureMatrix(
      frags,
      features = consensus_peaks[[i]],
      cells = NULL,
      sep = c("-", "-"),
      verbose = TRUE)
    
    message(paste0("### Creating chromatin assay for ", i))
    chrom_assay <- CreateChromatinAssay(
      counts = bin50k_kmatrix,
      sep = c(":", "_"),
      genome = "mm10",
      fragments = fragpaths[[i]],
      min.cells = 1,
      min.features = -1)
    
    message(paste0("### Creating Seurat object for ", i))
    seurat <- CreateSeuratObject(
      counts = chrom_assay,
      assay = "bin_50k")
    
    message(paste0("### Adding peak assay for ", i))
    seurat[["peaks"]] <- CreateChromatinAssay(
    counts = peak_matrix, genome = "mm10")
    
    message(paste0("### Adding RNA assay for ", i))
    seurat <- AddMetaData(seurat, rna_seurat@meta.data)
    seurat[["RNA"]] <- rna_seurat@assays[["RNA"]]

    Annotation(seurat) <- annotations_mm10
    
    seurat <- AddMetaData(seurat, CountFragments(fragpaths[[i]]))
    seurat <- FRiP(object = seurat, assay = "peaks", total.fragments = "reads_count")
    seurat@meta.data[["orig.ident"]] <- i
    seurat_list[[i]] <- seurat
    
    rm(seurat)
    rm(frags)
    rm(total_counts)
    rm(barcodes)
    rm(fragments_per_cell)
    rm(bin50k_matrix)
    rm(peak_matrix)
    rm(chromatin_assay)
}

saveRDS(seurat_list, file.path(outputDir_objects, "seurat_list.rds"))

## merging h3k4me1 and h3k27me3 data into one object
seurat <- merge(seurat_list[[1]], seurat_list[[2]])
seurat <- JoinLayers(seurat, assay = "RNA")

saveRDS(seurat, file.path(outputDir_objects, "seurat_cre_step1.rds"))

1.6 Reload the object - step1

seurat <- readRDS(file.path(outputDir_objects, "seurat_cre_step1.rds"))
seurat_list <- readRDS(file.path(outputDir_objects, "seurat_list.rds"))

2 FACS annotation

2.1 Gating

facs_data <- facs_data %>%
  mutate(phenotype = case_when(
    (CD24 > 100000) | (CD24 > 25000 & CD49f < 8900) ~ "luminal",
    TRUE ~ "basal"
  ))

ggplot(facs_data, aes(x = CD49f, y = CD24, color = phenotype)) +
  geom_point(size = 3) + 
  labs(x = "CD49f", y = "CD24", title = "CD24 vs CD49f by Phenotype") +
  theme_minimal() +
  scale_color_manual(values = c("luminal" = mypal[5], "basal" = mypal[4]))

ggplot(facs_data, aes(x = log(CD49f), y = log(CD24), color = phenotype)) +
  geom_point(size = 3) + 
  labs(x = "log(CD49f)", y = "log(CD24)", title = "CD24 vs CD49f by Phenotype log") +
  theme_minimal() +
  scale_color_manual(values = c("luminal" = mypal[5], "basal" = mypal[4]))

2.2 Adding FACS data as Seurat metadata

##the data will appear as phenotype column
seurat <- AddMetaData(
  object = seurat,
  metadata = facs_data
)

3 QC

3.1 Weighted histograms

for (s in seurat_list){
  p1 <- plot_weighted_hist(s) + ggtitle(print(s@meta.data[["orig.ident"]][1]), " DNA fragments")
  p2 <- plot_weighted_hist(s, assay = "RNA") + ggtitle(print(s@meta.data[["orig.ident"]][1]), " RNA reads")
  p3 <- plot_weighted_hist(s, assay = "RNA_features") + ggtitle(print(s@meta.data[["orig.ident"]][1]), " RNA features")
  print(p1)
  print(p2)
  print(p3)
  rm(s)
}
## [1] "h3k27me3"
## [1] "h3k27me3"
## [1] "h3k27me3"
## [1] "h3k4me1"
## [1] "h3k4me1"
## [1] "h3k4me1"

### Filtering

min_reads_chromatin = 400
min_reads_rna = 1000

seurat@meta.data[["filtering"]] <- ifelse(seurat@meta.data$reads_count > min_reads_chromatin & 
                                          seurat@meta.data$nCount_RNA > min_reads_rna,
                                                                             'pass', 'fail')

3.2 FrIP and N fragments plots

p_frip <- VlnPlot(subset(seurat, subset = filtering == 'pass'), c("FRiP"),
                  group.by = "orig.ident", split.by = NULL, pt.size = 0) +
                  labs(title = "FRiP CRE mice") +
                  stat_summary(fun.y = median, geom='point', size = 2, colour = "black") +
                  theme(legend.position = "none") +
                  scale_fill_manual(values = c(mypal[3], mypal[2]))
print(p_frip)

p_count <- VlnPlot(subset(seurat, subset = filtering == 'pass'), c("reads_count"),
                   group.by = "orig.ident", split.by = NULL, pt.size = 0, y.max = 12000) +
                   labs(title = "Unique fragments per cell CRE") +
                   stat_summary(fun.y = median, geom='point', size = 2, colour = "black") +
                   theme(legend.position = "none") +
                   scale_fill_manual(values = c(mypal[3], mypal[2]))
print(p_count)

p_count_rna <- VlnPlot(subset(seurat, subset = filtering == 'pass'), c("nCount_RNA"),
                       group.by = "orig.ident", split.by = NULL, pt.size = 0) +
                       labs(title = "Unique reads per cell RNA CRE") +
                       stat_summary(fun.y = median, geom='point', size = 2, colour = "black") +
                       theme(legend.position = "none") +
                       scale_fill_manual(values = c(mypal[3], mypal[2]))
print(p_count_rna)

p_genes_rna <- VlnPlot(subset(seurat, subset = filtering == 'pass'), c("nFeature_RNA"),
                        group.by = "orig.ident", split.by = NULL, pt.size = 0) +
                        labs(title = "Unique genes per cell CRE") +
                        stat_summary(fun.y = median, geom='point', size = 2, colour = "black") +
                        theme(legend.position = "none") +
                        scale_fill_manual(values = c(mypal[3], mypal[2]))
print(p_genes_rna)

ggsave(file.path(outputDir_plots, "frip_mouse_mammary_gland.pdf"),
       plot = p_frip,
       device = "pdf",
       units = "mm",
       width = 100,
       height = 130)

ggsave(file.path(outputDir_plots, "frip_mouse_mammary_gland.pdf"),
       plot = p_count,
       device = "pdf",
       units = "mm",
       width = 100,
       height = 130)

ggsave(file.path(outputDir_plots, "frip_mouse_mammary_gland.pdf"),
       plot = p_genes_rna,
       device = "pdf",
       units = "mm",
       width = 100,
       height = 130)

ggsave(file.path(outputDir_plots, "frip_mouse_mammary_gland.pdf"),
       plot = p_count_rna,
       device = "pdf",
       units = "mm",
       width = 100,
       height = 130)

3.3 DNA to RNA reads

seurat <- subset(seurat, subset = filtering == "pass")

frags_to_rna_reads <- ggplot(seurat@meta.data, aes(x = reads_count, y = nCount_RNA, color = orig.ident)) +
                             geom_point(alpha = 1, stroke = 0, size = 2) +  
                             labs(x = "N unique DNA fragments",
                                  y =  "N unique RNA reads",
                                title = "N unique DNA fragments vs. N unique RNA reads") +
                              theme_classic() +
                              scale_color_manual(values = c(mypal[3], mypal[2]))

frags_to_genes <- ggplot(seurat@meta.data, aes(x = reads_count, y = nFeature_RNA, color = orig.ident)) +
                             geom_point(alpha = 1, stroke = 0, size = 2) +  
                             labs(x = "N unique DNA fragments",
                                  y =  "N unique genes",
                                title = "N unique genes vs. N unique RNA reads") +
                              theme_classic() +
                              scale_color_manual(values = c(mypal[3], mypal[2]))

genes_to_rna_reads <- ggplot(seurat@meta.data, aes(x = nFeature_RNA, y = nCount_RNA, color = orig.ident)) +
                             geom_point(alpha = 1, stroke = 0, size = 2) +  
                             labs(x = "N unique genes",
                                  y =  "N unique RNA reads",
                                title = "N unique genes vs. N unique RNA reads") +
                              theme_classic() +
                              scale_color_manual(values = c(mypal[3], mypal[2]))

frags_to_rna_reads
frags_to_genes
genes_to_rna_reads

ggsave(file.path(outputDir_plots, "frags_to_rna_reads.pdf"),
       plot = frags_to_rna_reads,
       device = "pdf",
       units = "mm",
       width = 120,
       height = 80)

ggsave(file.path(outputDir_plots, "frags_to_genes.pdf"),
       plot = frags_to_genes,
       device = "pdf",
       units = "mm",
       width = 120,
       height = 80)

ggsave(file.path(outputDir_plots, "genes_to_rna_reads.pdf"),
       plot = genes_to_rna_reads,
       device = "pdf",
       units = "mm",
       width = 120,
       height = 80)

4 Clustering

4.1 Clustering by RNA

seurat <- subset(seurat, subset = filtering == 'pass')
DefaultAssay(seurat) <- "RNA"

seurat <- SCTransform(seurat)
seurat <- RunPCA(seurat)
seurat <- RunUMAP(seurat, dims = 1:30, verbose = FALSE, seed.use = 42)

seurat <- FindNeighbors(seurat, dims = 1:30, verbose = FALSE)
seurat <- FindClusters(seurat, resolution = 0.2)

DimPlot(seurat, cols = c(mypal[11], mypal[10]))

saveRDS(seurat, file.path(outputDir_objects, "seurat_cre_processed_step2.rds"))
seurat <- readRDS(file.path(outputDir_objects, "seurat_cre_processed_step2.rds"))

4.2 Clustering by epigenomic mark

Separating h3k27me3 and h3k4me1 data

cre_list <- list("h3k4me1" = subset(seurat, subset = orig.ident == "h3k4me1"), 
                 "h3k27me3" = subset(seurat, subset = orig.ident == "h3k27me3"))

Running normalization, dim reduction and checking the principal components.

for (mark in names(cre_list)){
  DefaultAssay(cre_list[[mark]]) <- 'bin_50k'
  cre_list[[mark]] <- RunTFIDF(cre_list[[mark]], assay = 'bin_50k')
  cre_list[[mark]] <- FindTopFeatures(cre_list[[mark]], min.cutoff = 'q0')
  cre_list[[mark]] <- RunSVD(cre_list[[mark]])
  p <- DepthCor(cre_list[[mark]])
  print(p)
}

Component 1 should not be used.

for (mark in names(cre_list)){
    cre_list[[mark]] <- RunUMAP(object = cre_list[[mark]],
                                reduction = 'lsi',
                                dims = 2:30)
    cre_list[[mark]] <- FindNeighbors(object = cre_list[[mark]],
                                      reduction = 'lsi',
                                      dims = 2:30)
    cre_list[[mark]] <- FindClusters(object = cre_list[[mark]],
                                     algorithm = 3,
                                     resolution = 0.8)
    #p <- DimPlot(cre_list[[mark]])
    #print(p)
}

saveRDS(cre_list, file.path(outputDir_objects, "seurat_cre_list_by_mark_processed_step2.rds"))
cre_list <- readRDS(file.path(outputDir_objects, "seurat_cre_list_by_mark_processed_step2.rds"))

Quick UMAPs to see sequencing-FACS correspondence

DimPlot(object = seurat, label = TRUE, cols = c(mypal[11], mypal[10])) + labs(title = "RNA clusters")
DimPlot(object = seurat, label = TRUE, cols = c(mypal[4], mypal[5]), group.by = "phenotype") +
                         labs(title = "RNA clusters, FACS annotation")

DimPlot(object = cre_list[["h3k27me3"]], label = TRUE, cols = c(mypal[11], mypal[10])) + labs(title = "H3K27me3 clusters")
DimPlot(object = cre_list[["h3k27me3"]], label = TRUE, cols = c(mypal[4], mypal[5]), group.by = "phenotype") +
                         labs(title = "H3K27me3 clusters, FACS annotation")

DimPlot(object = cre_list[["h3k4me1"]], label = TRUE, cols = c(mypal[11], mypal[10])) + labs(title = "H3K4me1 clusters")
DimPlot(object = cre_list[["h3k4me1"]], label = TRUE, cols = c(mypal[4], mypal[5]), group.by = "phenotype") +
                         labs(title = "H3K4me1 clusters, FACS annotation")

In RNA and K27: cluster 0 = basal-like, cluster 1 = luminal-like. In K4 it is vice versa.

Adding this annotation to meta_data:

tmp_vector <- cre_list[["h3k27me3"]]@meta.data[["seurat_clusters"]]
annot_k27 <- ifelse(tmp_vector == 1, "luminal_like_k27", "basal_like_k27")
cre_list[["h3k27me3"]]@meta.data[["annot_k27"]] <- annot_k27

tmp_vector2 <- cre_list[["h3k27me3"]]@meta.data[["SCT_snn_res.0.2"]]
annot_rna <- ifelse(tmp_vector2 == 1, "luminal_rna", "basal_rna")
cre_list[["h3k27me3"]]@meta.data[["annot_rna"]] <- annot_rna

tmp_vector <- cre_list[["h3k4me1"]]@meta.data[["seurat_clusters"]]
annot_k4 <- ifelse(tmp_vector == 1, "basal_like_k4", "luminal_like_k4")
cre_list[["h3k4me1"]]@meta.data[["annot_k4"]] <- annot_k4

tmp_vector2 <- cre_list[["h3k4me1"]]@meta.data[["SCT_snn_res.0.2"]]
annot_rna <- ifelse(tmp_vector2 == 1, "luminal_rna", "basal_rna")
cre_list[["h3k4me1"]]@meta.data[["annot_rna"]] <- annot_rna

cre_list[["h3k4me1"]]<- SetIdent(cre_list[["h3k4me1"]], value = "annot_k4")
cre_list[["h3k27me3"]]<- SetIdent(cre_list[["h3k27me3"]], value = "annot_k27")

tmp_vector3 <- seurat@meta.data[["SCT_snn_res.0.2"]]
annot_rna <- ifelse(tmp_vector3 == 1, "luminal_rna", "basal_rna")
seurat@meta.data[["annot_rna"]] <- annot_rna

seurat <- SetIdent(seurat, value = "annot_rna")


saveRDS(cre_list, file.path(outputDir_objects, "seurat_cre_list_by_mark_processed_step3.rds"))
saveRDS(seurat, file.path(outputDir_objects, "seurat_cre_processed_step3.rds"))

Reload objects - step3

cre_list <- readRDS(file.path(outputDir_objects, "seurat_cre_list_by_mark_processed_step3.rds"))
seurat <- readRDS(file.path(outputDir_objects, "seurat_cre_processed_step3.rds"))

4.3 UMAP plots

4.3.1 UMAPs colored by different annotations

cre_umap_rna <- DimPlot(object = seurat, label = TRUE, cols = c(mypal[11], mypal[10]), group.by = "annot_rna") + labs(title = "RNA clusters")
cre_umap_rna_facs <- DimPlot(object = seurat, label = TRUE, cols = c(mypal[4], mypal[5]), group.by = "phenotype") +
                         labs(title = "RNA clusters, FACS annotation")

cre_umap_k4 <- DimPlot(object = cre_list[["h3k4me1"]], label = TRUE, cols = c(mypal[11], mypal[10]), group.by = "annot_k4") + labs(title = "H3K4me1 clusters")
cre_umap_k4_facs <- DimPlot(object = cre_list[["h3k4me1"]], label = TRUE, cols = c(mypal[4], mypal[5]), group.by = "phenotype") +
                         labs(title = "H3K4me1 clusters, FACS annotation")

cre_umap_k27 <- DimPlot(object = cre_list[["h3k27me3"]], label = TRUE, cols = c(mypal[11], mypal[10]), group.by = "annot_k27") + labs(title = "H3K27me3 clusters")
cre_umap_k27_facs <- DimPlot(object = cre_list[["h3k27me3"]], label = TRUE, cols = c(mypal[4], mypal[5]), group.by = "phenotype") +
                         labs(title = "H3K27me3 clusters, FACS annotation")

cre_umap_rna
cre_umap_rna_facs
cre_umap_k4
cre_umap_k4_facs
cre_umap_k27
cre_umap_k27_facs

ggsave(file.path(outputDir_plots, "cre_umap_rna.pdf"),
       plot = cre_umap_rna,
       device = "pdf",
       units = "mm",
       width = 120,
       height = 100)
ggsave(file.path(outputDir_plots, "cre_umap_rna_facs.pdf"),
       plot = cre_umap_rna_facs,
       device = "pdf",
       units = "mm",
       width = 120,
       height = 100)
ggsave(file.path(outputDir_plots, "cre_umap_k4.pdf"),
       plot = cre_umap_k4,
       device = "pdf",
       units = "mm",
       width = 100,
       height = 100)
ggsave(file.path(outputDir_plots, "cre_umap_k4_facs.pdf"),
       plot = cre_umap_k4_facs,
       device = "pdf",
       units = "mm",
       width = 80,
       height = 100)
ggsave(file.path(outputDir_plots, "cre_umap_k27.pdf"),
       plot = cre_umap_k27,
       device = "pdf",
       units = "mm",
       width = 100,
       height = 100)
ggsave(file.path(outputDir_plots, "cre_umap_k27_facs.pdf"),
       plot = cre_umap_k27_facs,
       device = "pdf",
       units = "mm",
       width = 80,
       height = 100)

4.3.2 Connected UMAP

# Extract UMAP coordinates from each representation
df1 <- cre_list[["h3k4me1"]]@reductions[["umap"]]@cell.embeddings %>% as.data.frame()
df2 <- seurat@reductions[["umap"]]@cell.embeddings %>% as.data.frame()
df3 <- cre_list[["h3k27me3"]]@reductions[["umap"]]@cell.embeddings %>% as.data.frame()

# Extract information about cell type or cluster
df1$cluster <- as.vector(cre_list[["h3k4me1"]]@meta.data[["annot_k4"]])
df2$cluster <- as.vector(seurat@meta.data[["annot_rna"]])
df3$cluster <- as.vector(cre_list[["h3k27me3"]]@meta.data[["annot_k27"]])

# Add the info about the mark
df1$mark <- "H3K4me1"
df2$mark <- "RNA"
df3$mark <- "H3K27me3"

# Shift umap_1 for each dataset 
df1$umap_1 <- df1$umap_1 - 15  # Shift H3K4me1 left
df3$umap_1 <- df3$umap_1 + 20  # Shift H3K27me3 right

# Create a list and bind dataframes into one
dat.umap.lst <- list(df1, df2, df3)
dat.umap.long <- dat.umap.lst %>% bind_rows()
dat.umap.long$cell <- c(rownames(df1), rownames(df2), rownames(df3))

# Define color palette for clusters

cbPalette.ctype <- rep(c(mypal[10], mypal[11]), 3)
names(cbPalette.ctype) <- unique(dat.umap.long$cluster)

connected_umap_cre <- ggplot(dat.umap.long, aes(x = umap_1, y = umap_2, color = cluster, group = cell)) +
                             geom_point(alpha = 1) +
                             geom_path(alpha = 0.1) +
                             scale_color_manual(values = cbPalette.ctype) +
                             theme_classic() + ggtitle("H3K4me1 -> RNA -> H3K27me3")
connected_umap_cre 

ggsave(file.path(outputDir_plots, "connected_umap_cre_larger.pdf"),
       plot = connected_umap_cre,
       device = "pdf",
       units = "mm",
       width = 260,
       height = 100)

ggsave(file.path(outputDir_plots, "connected_umap_cre_smaller.pdf"),
       plot = connected_umap_cre,
       device = "pdf",
       units = "mm",
       width = 220,
       height = 80)

5 Differential analysis

5.1 RNA cluster markers

rna_markers <- FindMarkers(seurat, assay = "SCT", ident.1 = "luminal_rna")
DotPlot(seurat,
        features = rownames(rna_markers)[1:20],
        cols = c("#EDE9E7", "#7e6148")) +
        coord_flip() + labs(title = "Luminal vs basal markers, RNA") 

write.csv(rna_markers, file.path(outputDir_tables, "cre_rna_cluster_markers.csv"))

5.2 Differentially “acessible” bins and peaks - basal vs luminal

DefaultAssay(cre_list[["h3k4me1"]]) <- 'bin_50k'
DefaultAssay(cre_list[["h3k27me3"]]) <- 'bin_50k'

da_bins_k4 <- FindMarkers(
  object = cre_list[["h3k4me1"]],
  ident.1 = "luminal_like_k4", 
  ident.2 = "basal_like_k4",
  test.use = 'wilcox',
  min.pct = 0.1
)
da_bins_k4$bin <- rownames(da_bins_k4)
da_bins_k4$query_region <- rownames(da_bins_k4)

da_bins_k27 <- FindMarkers(
  object = cre_list[["h3k27me3"]],
  ident.1 = "luminal_like_k27", 
  ident.2 = "basal_like_k27",
  test.use = 'wilcox',
  min.pct = 0.1
)
da_bins_k27$bin <- rownames(da_bins_k27)
da_bins_k27$query_region <- rownames(da_bins_k27)

DefaultAssay(cre_list[["h3k4me1"]]) <- 'peaks'
DefaultAssay(cre_list[["h3k27me3"]]) <- 'peaks'

da_peaks_k4 <- FindMarkers(
  object = cre_list[["h3k4me1"]],
  ident.1 = "luminal_like_k4", 
  ident.2 = "basal_like_k4",
  test.use = 'wilcox',
  min.pct = 0.1
)
da_peaks_k4$query_region <- rownames(da_peaks_k4) #to do inner_join with the closest feature
colnames(da_peaks_k4) <- c("p_val", "avg_log2FC", "pct.luminal_like", "pct.basal_like", "p_val_adj", "query_region")

da_peaks_k27 <- FindMarkers(
  object = cre_list[["h3k27me3"]],
  ident.1 = "luminal_like_k27", 
  ident.2 = "basal_like_k27",
  test.use = 'wilcox',
  min.pct = 0.1
)
da_peaks_k27$query_region <- rownames(da_peaks_k27)
colnames(da_peaks_k27) <- c("p_val", "avg_log2FC", "pct.luminal_like", "pct.basal_like", "p_val_adj", "query_region")

Finding closest features.

features_da_k27 <- ClosestFeature(cre_list[["h3k27me3"]], regions = rownames(da_peaks_k27), annotation = annotations_mm10)
features_da_k4 <- ClosestFeature(cre_list[["h3k4me1"]], regions = rownames(da_peaks_k4), annotation = annotations_mm10)

da_peaks_k27 <- inner_join(da_peaks_k27, features_da_k27, by = "query_region")
da_peaks_k4 <- inner_join(da_peaks_k4, features_da_k4, by = "query_region")

features_da_k27_bins <- ClosestFeature(cre_list[["h3k27me3"]], regions = rownames(da_bins_k27), annotation = annotations_mm10)
features_da_k4_bins <- ClosestFeature(cre_list[["h3k4me1"]], regions = rownames(da_bins_k4), annotation = annotations_mm10)

da_bins_k27 <- inner_join(da_bins_k27, features_da_k27_bins, by = "query_region")
da_bins_k4 <- inner_join(da_bins_k4, features_da_k4_bins, by = "query_region")



write.csv(da_peaks_k27, file.path(outputDir_tables, "cre_diff_peaks_with_closest_genes_k27.csv"))
write.csv(da_peaks_k4, file.path(outputDir_tables, "cre_diff_peaks_with_closest_genes_k4.csv"))

write.csv(da_bins_k27, file.path(outputDir_tables, "cre_diff_bins_with_closest_genes_k27.csv"))
write.csv(da_bins_k4, file.path(outputDir_tables, "cre_diff_bins_with_closest_genes_k4.csv"))
intersect(row.names(rna_markers[rna_markers$p_val_adj < 0.05, ]),
          da_bins_k27[da_bins_k27$p_val < 0.05, ]$gene_name)
#Top 3:  "Cxcl14"        "Muc4"          "Fst" 

intersect(row.names(rna_markers[rna_markers$p_val_adj < 0.05, ]),
          da_bins_k4[da_bins_k4$p_val < 0.05, ]$gene_name)

#Top 3: "Apoe"          "Cxcl14"        "Sparc" 

roi = c("Cxcl14", "Muc4", "Fst", "Apoe","Sparc")
VlnPlot(seurat, roi, assay = "SCT", group.by = "annot_rna", cols = c(mypal[11], mypal[10]))

6 Coverage plots

6.1 Using the epigenomic annotation

DefaultAssay(cre_list[["h3k4me1"]]) <- "bin_50k"
DefaultAssay(cre_list[["h3k27me3"]]) <- "bin_50k"

cre_list[["h3k4me1"]] <- SetIdent(cre_list[["h3k4me1"]], value = cre_list[["h3k4me1"]]@meta.data[["annot_k4"]])
cre_list[["h3k27me3"]] <- SetIdent(cre_list[["h3k27me3"]], value = cre_list[["h3k27me3"]]@meta.data[["annot_k27"]])

Sub-setting the max cell for single-cell tracks.

# take one well-covered luminal and basal cell that are non-ambiguously annotated by all annotations 
max_cell_k27 <- subset(cre_list[["h3k27me3"]], cells = c("L539C172", "L539C186"))
max_cell_k4 <- subset(cre_list[["h3k4me1"]], cells = c("L539C302", "L539C290")) 

6.1.1 Fst

Marker of basal cells by RNA. Different in H3K27me3 in basal-like and luminal-like cells by H3K27me3

#roi="Fst"
#chr13:114,404,115-114,509,113
roi <- GRanges(seqnames = Rle(c("chr13"), c(1)),
               ranges = IRanges(114440000, end = 114500000, names = "Fst"))

gene_plot <- AnnotationPlot(
  object = cre_list[["h3k4me1"]],
  region = roi)

cov_plot_k27 <- CoveragePlot(
  object = cre_list[["h3k27me3"]],
  region = roi,
  annotation = FALSE,
  peaks = FALSE,
  window = 800) +
  ggtitle("H3K27me3 pseudobulk") + 
  scale_fill_manual(values = c(mypal[3], mypal[3], mypal[3]))

p_fst_k27 <- CombineTracks(plotlist = list(cov_plot_k27,gene_plot),
                       heights = c(15,8)) &
                       theme(axis.title.y = element_text(size = 7)) 
p_fst_expression <- VlnPlot(seurat, "Fst", assay = "SCT", group.by = "annot_rna", cols = c(mypal[11], mypal[10]))

p_fst_k27

p_fst_expression

6.1.2 Apoe

Marker of basal cells by RNA. Different in H3K4me1 in basal-like and luminal-like cells by H3K4me1

#roi="Apoe"
#chr7:19,684,744-19,704,653
roi <- GRanges(seqnames = Rle(c("chr7"), c(1)),
               ranges = IRanges(19689094, end = 19702731, names = "Apoe"))

gene_plot <- AnnotationPlot(
  object = cre_list[["h3k4me1"]],
  region = roi)

cov_plot_k4 <- CoveragePlot(
  object = cre_list[["h3k4me1"]],
  region = roi,
  annotation = FALSE,
  peaks = FALSE,
  window = 800) +
  ggtitle("H3K4me1 pseudobulk") + 
  scale_fill_manual(values = c(mypal[2], mypal[2], mypal[2]))


p_apoe_k4 <- CombineTracks(plotlist = list(cov_plot_k4,gene_plot),
                       heights = c(15,8)) &
                       theme(axis.title.y = element_text(size = 7)) 
p_apoe_expression <- VlnPlot(seurat, "Apoe", assay = "SCT", group.by = "annot_rna", cols = c(mypal[11], mypal[10]))

p_apoe_k4

p_apoe_expression

ggsave(file.path(outputDir_plots, "fst_tracks_by_epigenetic_annotation.pdf"),
       plot = p_fst_k27,
       device = "pdf",
       units = "mm",
       width = 200,
       height = 100)

ggsave(file.path(outputDir_plots, "apoe_tracks_by_epigenetic_annotation.pdf"),
       plot = p_apoe_k4,
       device = "pdf",
       units = "mm",
       width = 200,
       height = 100)

6.2 Using the RNA annotation

cre_list[["h3k4me1"]] <- SetIdent(cre_list[["h3k4me1"]], value = cre_list[["h3k4me1"]]@meta.data[["annot_rna"]])
cre_list[["h3k27me3"]] <- SetIdent(cre_list[["h3k27me3"]], value = cre_list[["h3k27me3"]]@meta.data[["annot_rna"]])
max_cell_k27 <- SetIdent(max_cell_k27, value = max_cell_k27@meta.data$annot_rna)
max_cell_k4 <- SetIdent(max_cell_k4, value = max_cell_k4@meta.data$annot_rna)
ggsave(file.path(outputDir_plots, "Plin2_tracks_by_rna.pdf"),
       plot = p_Plin2,
       device = "pdf",
       units = "mm",
       width = 200,
       height = 240)

ggsave(file.path(outputDir_plots, "Plin2_tiles_by_rna.pdf"),
       plot = tiles_Plin2,
       device = "pdf",
       units = "mm",
       width = 200,
       height = 150)

6.3 Keratines

#Keratine locus
#chr11:100,117,140-100,448,915

roi <- GRanges(seqnames = Rle(c("chr11"), c(1)),
               ranges = IRanges(start = 100117140, end = 100339961, names = "Krt"))

gene_plot <- AnnotationPlot(
  object = cre_list[["h3k4me1"]],
  region = roi)

cov_plot_k27 <- CoveragePlot(
  object = cre_list[["h3k27me3"]],
  region = roi,
  annotation = FALSE,
  peaks = FALSE,
  window = 4000) +
  ggtitle("H3K27me3 pseudobulk") + 
  scale_fill_manual(values = c(mypal[3], mypal[3], mypal[3]))

cov_plot_k27_max <- CoveragePlot(
  object = max_cell_k27,
  region = roi,
  annotation = FALSE,
  peaks = FALSE,
  window = 4000) +
  ggtitle("H3K27me3 single cell") +
  scale_fill_manual(values = c(mypal[3], mypal[3], mypal[3]))

cov_plot_k4 <- CoveragePlot(
  object = cre_list[["h3k4me1"]],
  region = roi,
  annotation = FALSE,
  peaks = FALSE,
  window = 4000) +
  ggtitle("H3K4me1 pseudobulk") + 
  scale_fill_manual(values = c(mypal[2], mypal[2], mypal[2]))

cov_plot_k4_max <- CoveragePlot(
  object = max_cell_k4,
  region = roi,
  annotation = FALSE,
  peaks = FALSE,
  window = 4000) +
  ggtitle("H3K4me1 single cell") +
  scale_fill_manual(values = c(mypal[2], mypal[2], mypal[2]))

cov_plot_rna <- BigwigTrack(
                     bigwig = list_rna_bw,
                     region = roi,
                     smooth = 1000,
                     type = "coverage",
                     y_label = "Normalised signal") +
                     scale_fill_manual(values = rep(mypal[1], 5)) +
                     theme(legend.position = "none") + 
                     ggtitle("RNA pseudobulk")
  

p_krt <- CombineTracks(plotlist = list(cov_plot_rna,
                                       cov_plot_k27_max, cov_plot_k27,
                                       cov_plot_k4_max, cov_plot_k4, gene_plot),
                       heights = c(40,15,15,15,15,8)) &
                       theme(axis.title.y = element_text(size = 7)) 

tile_plot_k27 <- TilePlot(
  object = cre_list[["h3k27me3"]],
  region = roi,
  tile.cells = 70,
  tile.size = 2000) +  scale_fill_gradient(low = "white", high = "#00806c")


tile_plot_k4 <- TilePlot(
  object = cre_list[["h3k4me1"]],
  region = roi,
  tile.cells = 70,
  tile.size = 2000) +  scale_fill_gradient(low = "white", high = "#1c6779")

tiles_krt <- CombineTracks(plotlist = list(tile_plot_k27, tile_plot_k4),
                       heights = c(15,15)) & theme(axis.title.y = element_text(size = 7)) 

p_krt

tiles_krt

ggsave(file.path(outputDir_plots, "krt_tracks_by_rna.pdf"),
       plot = p_krt,
       device = "pdf",
       units = "mm",
       width = 200,
       height = 240)

ggsave(file.path(outputDir_plots, "krt_tiles_by_rna.pdf"),
       plot = tiles_krt,
       device = "pdf",
       units = "mm",
       width = 200,
       height = 150)
6.3.0.1 Krt19
#roi="Krt19"
#chr11:100,138,496-100,148,799

roi <- GRanges(seqnames = Rle(c("chr11"), c(1)),
               ranges = IRanges(100138496, end = 100148799, names = "Krt19"))

gene_plot <- AnnotationPlot(
  object = cre_list[["h3k4me1"]],
  region = roi)

cov_plot_k27 <- CoveragePlot(
  object = cre_list[["h3k27me3"]],
  region = roi,
  annotation = FALSE,
  peaks = FALSE,
  window = 400) +
  ggtitle("H3K27me3 pseudobulk") + 
  scale_fill_manual(values = c(mypal[3], mypal[3], mypal[3]))

cov_plot_k27_max <- CoveragePlot(
  object = max_cell_k27,
  region = roi,
  annotation = FALSE,
  peaks = FALSE,
  window = 400) +
  ggtitle("H3K27me3 single cell") +
  scale_fill_manual(values = c(mypal[3], mypal[3], mypal[3]))

cov_plot_k4 <- CoveragePlot(
  object = cre_list[["h3k4me1"]],
  region = roi,
  annotation = FALSE,
  peaks = FALSE,
  window = 400) +
  ggtitle("H3K4me1 pseudobulk") + 
  scale_fill_manual(values = c(mypal[2], mypal[2], mypal[2]))

cov_plot_k4_max <- CoveragePlot(
  object = max_cell_k4,
  region = roi,
  annotation = FALSE,
  peaks = FALSE,
  window = 400) +
  ggtitle("H3K4me1 single cell") +
  scale_fill_manual(values = c(mypal[2], mypal[2], mypal[2]))

cov_plot_rna <- BigwigTrack(
                     bigwig = list_rna_bw,
                     region = roi,
                     smooth = 200,
                     type = "coverage",
                     y_label = "Normalised signal") +
                     scale_fill_manual(values = rep(mypal[1], 5)) +
                     theme(legend.position = "none") + 
                     ggtitle("RNA pseudobulk")
  

p_krt19 <- CombineTracks(plotlist = list(cov_plot_rna,
                                       cov_plot_k27_max, cov_plot_k27,
                                       cov_plot_k4_max, cov_plot_k4, gene_plot),
                       heights = c(30,15,15,15,15,8)) &
                       theme(axis.title.y = element_text(size = 7)) 

tile_plot_k27 <- TilePlot(
  object = cre_list[["h3k27me3"]],
  region = roi,
  tile.cells = 70,
  tile.size = 400) +  scale_fill_gradient(low = "white", high = "#00806c")


tile_plot_k4 <- TilePlot(
  object = cre_list[["h3k4me1"]],
  region = roi,
  tile.cells = 70,
  tile.size = 400) +  scale_fill_gradient(low = "white", high = "#1c6779")

tiles_krt19 <- CombineTracks(plotlist = list(tile_plot_k27, tile_plot_k4),
                       heights = c(15,15)) & theme(axis.title.y = element_text(size = 7)) 

p_krt19

tiles_krt19

ggsave(file.path(outputDir_plots, "krt19_tracks_by_rna.pdf"),
       plot = p_krt19,
       device = "pdf",
       units = "mm",
       width = 200,
       height = 240)

ggsave(file.path(outputDir_plots, "krt19_tiles_by_rna.pdf"),
       plot = tiles_krt19,
       device = "pdf",
       units = "mm",
       width = 200,
       height = 150)
6.3.0.2 Krt14
#roi="Krt14"
#chr11:100,198,634-100,211,215

roi <- GRanges(seqnames = Rle(c("chr11"), c(1)),
               ranges = IRanges(100198634, end = 100211215, names = "Krt14"))

gene_plot <- AnnotationPlot(
  object = cre_list[["h3k4me1"]],
  region = roi)

cov_plot_k27 <- CoveragePlot(
  object = cre_list[["h3k27me3"]],
  region = roi,
  annotation = FALSE,
  peaks = FALSE,
  window = 400) +
  ggtitle("H3K27me3 pseudobulk") + 
  scale_fill_manual(values = c(mypal[3], mypal[3], mypal[3]))

cov_plot_k27_max <- CoveragePlot(
  object = max_cell_k27,
  region = roi,
  annotation = FALSE,
  peaks = FALSE,
  window = 400) +
  ggtitle("H3K27me3 single cell") +
  scale_fill_manual(values = c(mypal[3], mypal[3], mypal[3]))

cov_plot_k4 <- CoveragePlot(
  object = cre_list[["h3k4me1"]],
  region = roi,
  annotation = FALSE,
  peaks = FALSE,
  window = 400) +
  ggtitle("H3K4me1 pseudobulk") + 
  scale_fill_manual(values = c(mypal[2], mypal[2], mypal[2]))

cov_plot_k4_max <- CoveragePlot(
  object = max_cell_k4,
  region = roi,
  annotation = FALSE,
  peaks = FALSE,
  window = 400) +
  ggtitle("H3K4me1 single cell") +
  scale_fill_manual(values = c(mypal[2], mypal[2], mypal[2]))

cov_plot_rna <- BigwigTrack(
                     bigwig = list_rna_bw,
                     region = roi,
                     smooth = 200,
                     type = "coverage",
                     y_label = "Normalised signal") +
                     scale_fill_manual(values = rep(mypal[1], 5)) +
                     theme(legend.position = "none") + 
                     ggtitle("RNA pseudobulk")
  

p_krt14 <- CombineTracks(plotlist = list(cov_plot_rna,
                                       cov_plot_k27_max, cov_plot_k27,
                                       cov_plot_k4_max, cov_plot_k4, gene_plot),
                       heights = c(30,15,15,15,15,8)) &
                       theme(axis.title.y = element_text(size = 7)) 

tile_plot_k27 <- TilePlot(
  object = cre_list[["h3k27me3"]],
  region = roi,
  tile.cells = 70,
  tile.size = 200) +  scale_fill_gradient(low = "white", high = "#00806c")


tile_plot_k4 <- TilePlot(
  object = cre_list[["h3k4me1"]],
  region = roi,
  tile.cells = 70,
  tile.size = 200) +  scale_fill_gradient(low = "white", high = "#1c6779")

tiles_krt14 <- CombineTracks(plotlist = list(tile_plot_k27, tile_plot_k4),
                       heights = c(15,15)) & theme(axis.title.y = element_text(size = 7)) 

tiles_krt14

p_krt14

ggsave(file.path(outputDir_plots, "krt14_tracks_by_rna.pdf"),
       plot = p_krt14,
       device = "pdf",
       units = "mm",
       width = 200,
       height = 240)

ggsave(file.path(outputDir_plots, "krt14_tiles_by_rna.pdf"),
       plot = tiles_krt14,
       device = "pdf",
       units = "mm",
       width = 200,
       height = 150)
6.3.0.3 Krt17
#roi="Krt17"
#chr11:100,252,878-100,265,917

roi <- GRanges(seqnames = Rle(c("chr11"), c(1)),
               ranges = IRanges(100252878, end = 100265917, names = "Krt17"))

gene_plot <- AnnotationPlot(
  object = cre_list[["h3k4me1"]],
  region = roi)

cov_plot_k27 <- CoveragePlot(
  object = cre_list[["h3k27me3"]],
  region = roi,
  annotation = FALSE,
  peaks = FALSE,
  window = 400) +
  ggtitle("H3K27me3 pseudobulk") + 
  scale_fill_manual(values = c(mypal[3], mypal[3], mypal[3]))

cov_plot_k27_max <- CoveragePlot(
  object = max_cell_k27,
  region = roi,
  annotation = FALSE,
  peaks = FALSE,
  window = 400) +
  ggtitle("H3K27me3 single cell") +
  scale_fill_manual(values = c(mypal[3], mypal[3], mypal[3]))

cov_plot_k4 <- CoveragePlot(
  object = cre_list[["h3k4me1"]],
  region = roi,
  annotation = FALSE,
  peaks = FALSE,
  window = 400) +
  ggtitle("H3K4me1 pseudobulk") + 
  scale_fill_manual(values = c(mypal[2], mypal[2], mypal[2]))

cov_plot_k4_max <- CoveragePlot(
  object = max_cell_k4,
  region = roi,
  annotation = FALSE,
  peaks = FALSE,
  window = 400) +
  ggtitle("H3K4me1 single cell") +
  scale_fill_manual(values = c(mypal[2], mypal[2], mypal[2]))

cov_plot_rna <- BigwigTrack(
                     bigwig = list_rna_bw,
                     region = roi,
                     smooth = 200,
                     type = "coverage",
                     y_label = "Normalised signal") +
                     scale_fill_manual(values = rep(mypal[1], 5)) +
                     theme(legend.position = "none") + 
                     ggtitle("RNA pseudobulk")
  

p_krt17 <- CombineTracks(plotlist = list(cov_plot_rna,
                                       cov_plot_k27_max, cov_plot_k27,
                                       cov_plot_k4_max, cov_plot_k4, gene_plot),
                       heights = c(30,15,15,15,15,8)) &
                       theme(axis.title.y = element_text(size = 7)) 

tile_plot_k27 <- TilePlot(
  object = cre_list[["h3k27me3"]],
  region = roi,
  tile.cells = 70,
  tile.size = 200) +  scale_fill_gradient(low = "white", high = "#00806c")


tile_plot_k4 <- TilePlot(
  object = cre_list[["h3k4me1"]],
  region = roi,
  tile.cells = 70,
  tile.size = 200) +  scale_fill_gradient(low = "white", high = "#1c6779")

tiles_krt17 <- CombineTracks(plotlist = list(tile_plot_k27, tile_plot_k4),
                       heights = c(15,15)) & theme(axis.title.y = element_text(size = 7)) 

p_krt17

tiles_krt17

ggsave(file.path(outputDir_plots, "krt17_tracks_by_rna.pdf"),
       plot = p_krt17,
       device = "pdf",
       units = "mm",
       width = 200,
       height = 240)

ggsave(file.path(outputDir_plots, "krt17_tiles_by_rna.pdf"),
       plot = tiles_krt17,
       device = "pdf",
       units = "mm",
       width = 200,
       height = 150)

6.4 A region with differences in both K27 & K4

roi <- GRanges(seqnames = Rle(c("chr2"), c(1)),
                       ranges = IRanges(33005895, end = 34356499, names = "roi"))

gene_plot <- AnnotationPlot(
  object = cre_list[["h3k4me1"]],
  region = roi)

cov_plot_k27 <- CoveragePlot(
  object = cre_list[["h3k27me3"]],
  region = roi,
  annotation = FALSE,
  peaks = FALSE,
  window = 10000) +
  ggtitle("H3K27me3 pseudobulk") + 
  scale_fill_manual(values = c(mypal[3], mypal[3], mypal[3]))

cov_plot_k27_max <- CoveragePlot(
  object = max_cell_k27,
  region = roi,
  annotation = FALSE,
  peaks = FALSE,
  window = 10000) +
  ggtitle("H3K27me3 single cell") +
  scale_fill_manual(values = c(mypal[3], mypal[3], mypal[3]))

cov_plot_k4 <- CoveragePlot(
  object = cre_list[["h3k4me1"]],
  region = roi,
  annotation = FALSE,
  peaks = FALSE,
  window = 10000) +
  ggtitle("H3K4me1 pseudobulk") + 
  scale_fill_manual(values = c(mypal[2], mypal[2], mypal[2]))

cov_plot_k4_max <- CoveragePlot(
  object = max_cell_k4,
  region = roi,
  annotation = FALSE,
  peaks = FALSE,
  window = 10000) +
  ggtitle("H3K4me1 single cell") +
  scale_fill_manual(values = c(mypal[2], mypal[2], mypal[2]))

cov_plot_rna <- BigwigTrack(
                     bigwig = list_rna_bw,
                     region = roi,
                     smooth = 10000,
                     type = "coverage",
                     y_label = "Normalised signal") +
                     scale_fill_manual(values = rep(mypal[1], 5)) +
                     theme(legend.position = "none") + 
                     ggtitle("RNA pseudobulk")
  
p <- CombineTracks(plotlist = list(cov_plot_rna,
                                       cov_plot_k27_max, cov_plot_k27,
                                       cov_plot_k4_max, cov_plot_k4, gene_plot),
                       heights = c(30,15,15,15,15,8)) &
                       theme(axis.title.y = element_text(size = 7)) 

tile_plot_k27 <- TilePlot(
  object = cre_list[["h3k27me3"]],
  region = roi,
  tile.cells = 70,
  tile.size = 5000) +  scale_fill_gradient(low = "white", high = "#00806c")


tile_plot_k4 <- TilePlot(
  object = cre_list[["h3k4me1"]],
  region = roi,
  tile.cells = 70,
  tile.size = 5000) +  scale_fill_gradient(low = "white", high = "#1c6779")

tiles <- CombineTracks(plotlist = list(tile_plot_k27, tile_plot_k4),
                       heights = c(15,15)) & theme(axis.title.y = element_text(size = 7)) 

tiles

p

ggsave(file.path(outputDir_plots, "region1_tracks_window10000_by_rna.pdf"),
       plot = p,
       device = "pdf",
       units = "mm",
       width = 200,
       height = 240)

ggsave(file.path(outputDir_plots, "region1_tiles_window10000_by_rna.pdf"),
       plot = tiles,
       device = "pdf",
       units = "mm",
       width = 200,
       height = 150)

6.5 Expression of the genes in these regions

roi_krt = c("Krt15", "Krt19", "Krt14", "Krt16", "Krt17", "Eif1")
p_genes_rna_krt <- VlnPlot(seurat, roi_krt, assay = "SCT", group.by = "annot_rna", cols = c(mypal[11], mypal[10]))


roi_region1 = c("Garnl3", "Ralgps1", "Zbtb34", "Lmx1b", "Mvb12b", "Gm13403", "C230014O12Rik", "Angptl2", "Zbtb43", "Gm13530", "9430024E24Rik", "Gm13404", "Pbx3")
p_genes_rna_region1 <- VlnPlot(seurat, roi_region1, assay = "SCT", group.by = "annot_rna", cols = c(mypal[11], mypal[10]))

p_genes_rna_krt

p_genes_rna_region1

ggsave(file.path(outputDir_plots, "genes_rna_region1.pdf"),
       plot = p_genes_rna_region1,
       device = "pdf",
       units = "mm",
       width = 250,
       height = 250)

ggsave(file.path(outputDir_plots, "genes_rna_krt.pdf"),
       plot = p_genes_rna_krt,
       device = "pdf",
       units = "mm",
       width = 200,
       height = 200)

7 Gene activity matrix UMAP

7.1 Calculating the gene activity matrix

gene_activity_k4 <- GeneActivity(cre_list[["h3k4me1"]],
                                 assay = "bin_50k",
                                 extend.upstream = 3000,
                                 extend.downstream = 0,
                                 process_n = 3000)

cre_list[["h3k4me1"]][['gene_activity']] <- CreateAssayObject(counts = gene_activity_k4)

gene_activity_k27 <- GeneActivity(cre_list[["h3k27me3"]],
                                 assay = "bin_50k",
                                 extend.upstream = 3000,
                                 extend.downstream = 0,
                                 process_n = 3000)
cre_list[["h3k27me3"]][['gene_activity']] <- CreateAssayObject(counts = gene_activity_k27)

7.2 Calculating differential gene activity

DefaultAssay(cre_list[["h3k27me3"]]) <- 'gene_activity'
cre_list[["h3k27me3"]] <- SetIdent(cre_list[["h3k27me3"]], value = "annot_rna")

DefaultAssay(cre_list[["h3k4me1"]]) <- 'gene_activity'
cre_list[["h3k4me1"]] <- SetIdent(cre_list[["h3k4me1"]], value = "annot_rna")

da_activity_k27 <- FindMarkers(
  object = cre_list[["h3k27me3"]],
  ident.1 = "luminal_rna", 
  ident.2 = "basal_rna",
  test.use = 'wilcox',
  min.pct = 0.1
)

da_activity_k27 <- da_activity_k27[da_activity_k27$p_val < 0.05, ]
da_activity_k27$gene <- row.names(da_activity_k27)
colnames(da_activity_k27) <- c("pval_k27", "log2FC_k27_luminal_vs_basal", "pct.1", "pct.2", "padj_k27", "gene")

da_activity_k4 <- FindMarkers(
  object = cre_list[["h3k4me1"]],
  ident.1 = "luminal_rna", 
  ident.2 = "basal_rna",
  test.use = 'wilcox',
  min.pct = 0.1
)

da_activity_k4 <- da_activity_k4[da_activity_k4$p_val < 0.05, ]
da_activity_k4$gene <- row.names(da_activity_k4)
colnames(da_activity_k4) <- c("pval_k4", "log2FC_k4_luminal_vs_basal", "pct.1", "pct.2", "padj_k4", "gene")

7.3 Preparing the UMAP input matrix

meta_k4 <- data.frame(annot = cre_list[["h3k4me1"]]@meta.data[["annot_rna"]],
           cb = cre_list[["h3k4me1"]]@meta.data[["CB"]])
meta_k27 <- data.frame(annot = cre_list[["h3k27me3"]]@meta.data[["annot_rna"]],
           cb = cre_list[["h3k27me3"]]@meta.data[["CB"]])

luminal_matrix_k4 <- gene_activity_k4[, colnames(gene_activity_k4) %in% meta_k4$cb[meta_k4$annot == "luminal_rna"], drop = FALSE]
basal_matrix_k4 <- gene_activity_k4[, colnames(gene_activity_k4) %in% meta_k4$cb[meta_k4$annot == "basal_rna"], drop = FALSE]


luminal_matrix_k27 <- gene_activity_k27[, colnames(gene_activity_k27) %in% meta_k27$cb[meta_k27$annot == "luminal_rna"], drop = FALSE]
basal_matrix_k27 <- gene_activity_k27[, colnames(gene_activity_k27) %in% meta_k27$cb[meta_k27$annot == "basal_rna"], drop = FALSE]

df <- data.frame(
  gene = rownames(gene_activity_k4),
  basal_signal_k27 = rowSums(basal_matrix_k27),
  basal_signal_k4 = rowSums(basal_matrix_k4),
  luminal_signal_k27 = rowSums(luminal_matrix_k27),    
  luminal_signal_k4 = rowSums(luminal_matrix_k4),
  pct_basal_k27 = rowSums(basal_matrix_k27 != 0) / ncol(basal_matrix_k27) * 100,
  pct_luminal_k27 = rowSums(luminal_matrix_k27 != 0) / ncol(luminal_matrix_k27) * 100,
  pct_basal_k4 = rowSums(basal_matrix_k4 != 0) / ncol(basal_matrix_k4) * 100,
  pct_luminal_k4 = rowSums(luminal_matrix_k4 != 0) / ncol(luminal_matrix_k4) * 100
)

# filtering of low covered regions
df <- df[df$luminal_signal_k4 + df$basal_signal_k4 + df$luminal_signal_k27 + df$basal_signal_k27 > 25, ]

#as we have more basal than luminal cells, will normalize the signal, so that each cell has equal input
norm_factor <- length(meta_k4$cb[meta_k4$annot == "basal_rna"]) / length(meta_k4$cb[meta_k4$annot == "luminal_rna"])

df$basal_signal_k27_norm <- df$basal_signal_k27 / norm_factor
df$basal_signal_k4_norm <- df$basal_signal_k4 / norm_factor 


df <- merge(df, da_activity_k27[, c("gene", "log2FC_k27_luminal_vs_basal")], by = "gene", all.x = TRUE)
df <- merge(df, da_activity_k4[, c("gene", "log2FC_k4_luminal_vs_basal")], by = "gene", all.x = TRUE)

# set the LF of all non-differential genes as 0
df <- df %>% mutate_all(~replace(., is.na(.), 0))

### For k4 only
df_k4 <- data.frame(
  gene = rownames(gene_activity_k4),
  basal_signal_k27 = rowSums(basal_matrix_k27),
  basal_signal_k4 = rowSums(basal_matrix_k4),
  luminal_signal_k27 = rowSums(luminal_matrix_k27),    
  luminal_signal_k4 = rowSums(luminal_matrix_k4),
  pct_basal_k4 = rowSums(basal_matrix_k4 != 0) / ncol(basal_matrix_k4) * 100,
  pct_luminal_k4 = rowSums(luminal_matrix_k4 != 0) / ncol(luminal_matrix_k4) * 100,
  pct_basal_k27 = rowSums(basal_matrix_k27 != 0) / ncol(basal_matrix_k27) * 100,
  pct_luminal_k27 = rowSums(luminal_matrix_k27 != 0) / ncol(luminal_matrix_k27) * 100)

df_k4 <- df_k4[df_k4$luminal_signal_k4 + df_k4$basal_signal_k4 > 10, ]

df_k4$basal_signal_k4_norm <- df_k4$basal_signal_k4 / norm_factor 

df_k4 <- merge(df_k4, da_activity_k27[, c("gene", "log2FC_k27_luminal_vs_basal")], by = "gene", all.x = TRUE)
df_k4 <- merge(df_k4, da_activity_k4[, c("gene", "log2FC_k4_luminal_vs_basal")], by = "gene", all.x = TRUE)

df_k4 <- df_k4 %>% mutate_all(~replace(., is.na(.), 0))


### For k27 only
df_k27 <- data.frame(
  gene = rownames(gene_activity_k4),
  basal_signal_k27 = rowSums(basal_matrix_k27),
  basal_signal_k4 = rowSums(basal_matrix_k4),
  luminal_signal_k27 = rowSums(luminal_matrix_k27),    
  luminal_signal_k4 = rowSums(luminal_matrix_k4),
  pct_basal_k27 = rowSums(basal_matrix_k27 != 0) / ncol(basal_matrix_k27) * 100,
  pct_luminal_k27 = rowSums(luminal_matrix_k27 != 0) / ncol(luminal_matrix_k27) * 100,
  pct_basal_k4 = rowSums(basal_matrix_k4 != 0) / ncol(basal_matrix_k4) * 100,
  pct_luminal_k4 = rowSums(luminal_matrix_k4 != 0) / ncol(luminal_matrix_k4) * 100)


df_k27 <- df_k27[df_k27$luminal_signal_k27 + df_k27$basal_signal_k27 > 10, ]

df_k27$basal_signal_k27_norm <- df_k27$basal_signal_k27 / norm_factor 

df_k27 <- merge(df_k27, da_activity_k27[, c("gene", "log2FC_k27_luminal_vs_basal")], by = "gene", all.x = TRUE)
df_k27 <- merge(df_k27, da_activity_k4[, c("gene", "log2FC_k4_luminal_vs_basal")], by = "gene", all.x = TRUE)

df_k27 <- df_k27 %>% mutate_all(~replace(., is.na(.), 0))

7.3.1 Intercellular entropy calcluation

### Function to calculate entropy based on the transposed gene activity matrix
calculate_entropy <- function(gene_expression) {
  # Normalize expression to get probabilities
  total_expression <- sum(gene_expression)
  
  if (total_expression == 0) {
    return(0)  # If no expression, entropy is 0
  }
  
  probabilities <- gene_expression / total_expression
  
  entropy <- -sum(probabilities * log2(probabilities + 1e-10))  # Adding small value to avoid log(0)
  return(entropy)
}


intercellular_entropy_k4 <- as.data.frame(apply(as.data.frame(t(gene_activity_k4)), 2, calculate_entropy)) #2 - columns
intercellular_entropy_k27 <- as.data.frame(apply(as.data.frame(t(gene_activity_k27)), 2, calculate_entropy)) 

colnames(intercellular_entropy_k4) <- "entropy_k4"
colnames(intercellular_entropy_k27) <- "entropy_k27"

intercellular_entropy_k27$gene <- row.names(intercellular_entropy_k27)
intercellular_entropy_k4$gene <- row.names(intercellular_entropy_k4)

df_k4 <- left_join(df_k4, intercellular_entropy_k4, by = "gene")
df_k4 <- left_join(df_k4, intercellular_entropy_k27, by = "gene")
df_k27 <- left_join(df_k27, intercellular_entropy_k27, by = "gene")
df_k27 <- left_join(df_k27, intercellular_entropy_k4, by = "gene")



## Separately for basal and luminal

intercellular_entropy_k27_luminal <- as.data.frame(apply(as.data.frame(t(luminal_matrix_k27)), 2, calculate_entropy)) 
colnames(intercellular_entropy_k27_luminal) <- "entropy_k27_luminal"
intercellular_entropy_k27_luminal$gene <- row.names(intercellular_entropy_k27_luminal)
df_k27 <- left_join(df_k27, intercellular_entropy_k27_luminal, by = "gene")


intercellular_entropy_k27_basal <- as.data.frame(apply(as.data.frame(t(basal_matrix_k27)), 2, calculate_entropy)) 
colnames(intercellular_entropy_k27_basal) <- "entropy_k27_basal"
intercellular_entropy_k27_basal$gene <- row.names(intercellular_entropy_k27_basal)
df_k27 <- left_join(df_k27, intercellular_entropy_k27_basal, by = "gene")


#luminal_matrix_k27
#basal_matrix_k27

7.3.2 UMAP calculation

All together

set.seed(123)
umap_gene_activity_lf_pval0.05 <- umap(df[, c("luminal_signal_k27", "luminal_signal_k4",
                                            "basal_signal_k27_norm", "basal_signal_k4_norm",
                                            "log2FC_k27_luminal_vs_basal", "log2FC_k4_luminal_vs_basal")],
                                       n_neighbors = 200,
                                       min_dist = 0.1)

saveRDS(umap_gene_activity_lf_pval0.05, file.path(outputDir_objects, "umap_gene_activity_filt25_lf_pval00.5.rds"))

Separate by mark

set.seed(123)

umap_gene_activity_lf_pval0.05_k4 <- umap(df_k4[, c("luminal_signal_k4", "basal_signal_k4_norm",
                                                    "log2FC_k4_luminal_vs_basal")],
                                       n_neighbors = 200,
                                       min_dist = 0.1)

umap_gene_activity_lf_pval0.05_k27 <- umap(df_k27[, c("luminal_signal_k27", "basal_signal_k27_norm",
                                                    "log2FC_k27_luminal_vs_basal")],
                                       n_neighbors = 200,
                                       min_dist = 0.1)


saveRDS(umap_gene_activity_lf_pval0.05_k27, file.path(outputDir_objects, "umap_gene_activity_filt10_lf_pval00.5_k27.rds"))
saveRDS(umap_gene_activity_lf_pval0.05_k4, file.path(outputDir_objects, "umap_gene_activity_filt10_lf_pval00.5_k4.rds"))

7.4 UMAP plots (all info)

log1p_trans <- trans_new("log1p", transform = function(x) log(x + 1), inverse = function(x) exp(x) - 1)
umap_gene_activity_lf_pval0.05 <- readRDS(file.path(outputDir_objects, "umap_gene_activity_filt25_lf_pval00.5.rds"))


umap_embeddings <- as.data.frame(umap_gene_activity_lf_pval0.05$layout)
colnames(umap_embeddings) <- c("UMAP_1", "UMAP_2")
umap_embeddings$gene <- df$gene
umap_embeddings$signal_k4 <- df$basal_signal_k4 + df$luminal_signal_k4
umap_embeddings$signal_k27 <- df$basal_signal_k27 + df$luminal_signal_k27
umap_embeddings$signal_k4_basal <- df$basal_signal_k4
umap_embeddings$signal_k27_basal <- df$basal_signal_k27
umap_embeddings$signal_k4_luminal <- df$luminal_signal_k4
umap_embeddings$signal_k27_luminal <- df$luminal_signal_k27
umap_embeddings$log2FC_k27_luminal_vs_basal <- df$log2FC_k27_luminal_vs_basal
umap_embeddings$log2FC_k4_luminal_vs_basal <- df$log2FC_k4_luminal_vs_basal
umap_embeddings$signal_k4_basal_norm <- df$basal_signal_k4_norm
umap_embeddings$signal_k27_basal_norm <- df$basal_signal_k27_norm

p1 <- ggplot(umap_embeddings, aes(x = UMAP_1, y = UMAP_2, color = signal_k27, label = gene)) +
  geom_point(size = 1) +
  theme_minimal() +
  ggtitle("Genes colored by total H3K27me3 signal") +
  xlab("UMAP Dimension 1") +
  ylab("UMAP Dimension 2") +
  labs(color = "total H3K27me3") +
  theme(legend.position = "right")  +
  theme_classic() + 
  scale_color_gradientn(colors = c("#D3D3D3", "#D3D3D3", "#BECFCC", "#00A087E5", "#005043", "#00201b"),
                        trans = log1p_trans,
                        breaks = c(0, 10, 20, 50, 100, 200, 400, 600))

p2 <- ggplot(umap_embeddings, aes(x = UMAP_1, y = UMAP_2, color = signal_k4, label = gene)) +
  geom_point(size = 1) +
  theme_minimal() +
  ggtitle("Genes colored by total H3K4me1 signal") +
  xlab("UMAP Dimension 1") +
  ylab("UMAP Dimension 2") +
  labs(color = "total H3K4me1") +
  theme(legend.position = "right")  +
  theme_classic() + 
  scale_color_gradientn(colors = c("#D3D3D3", "#D3D3D3", "#A3C5D0", "#4DBBD5E5", "#006D7F", "#00201b"),
                        trans = log1p_trans,
                        breaks = c(0, 10, 20, 50, 100, 200, 400, 600))


p3 <- ggplot(umap_embeddings, aes(x = UMAP_1, y = UMAP_2, color = signal_k27_basal_norm, label = gene)) +
  geom_point(size = 1) +
  theme_minimal() +
  ggtitle("Genes colored by basal H3K27me3 signal") +
  xlab("UMAP Dimension 1") +
  ylab("UMAP Dimension 2") +
  labs(color = "basal H3K27me3") +
  theme(legend.position = "right") +
  theme_classic() +
  theme_classic() + 
  scale_color_gradientn(colors = c("#D3D3D3", "#D3D3D3", "#BECFCC", "#00A087E5", "#005043", "#00201b"),
                        trans = log1p_trans,
                        breaks = c(0, 10, 20, 50, 100, 200, 400, 600))

p4 <- ggplot(umap_embeddings, aes(x = UMAP_1, y = UMAP_2, color = signal_k27_luminal, label = gene)) +
  geom_point(size = 1) +
  theme_minimal() +
  ggtitle("Genes colored by luminal H3K27me3 signal") +
  xlab("UMAP Dimension 1") +
  ylab("UMAP Dimension 2") +
  labs(color = "luminal H3K27me3") +
  theme(legend.position = "right") +
  theme_classic() + 
  scale_color_gradientn(colors = c("#D3D3D3", "#D3D3D3", "#BECFCC", "#00A087E5", "#005043", "#00201b"),
                        trans = log1p_trans,
                        breaks = c(0, 10, 20, 50, 100, 200, 400, 600))

p5 <- ggplot(umap_embeddings, aes(x = UMAP_1, y = UMAP_2, color = signal_k4_basal_norm, label = gene)) +
  geom_point(size = 1) +
  theme_minimal() +
  ggtitle("Genes colored by basal H3K4me1 signal") +
  xlab("UMAP Dimension 1") +
  ylab("UMAP Dimension 2") +
  labs(color = "basal H3K4me1") +
  theme(legend.position = "right") +
  theme_classic() + 
  scale_color_gradientn(colors = c("#D3D3D3", "#D3D3D3", "#A3C5D0", "#4DBBD5E5", "#006D7F", "#00201b"),
                        trans = log1p_trans,
                        breaks = c(0, 10, 20, 50, 100, 200, 400, 600))

p6 <- ggplot(umap_embeddings, aes(x = UMAP_1, y = UMAP_2, color = signal_k4_luminal, label = gene)) +
  geom_point(size = 1) +
  theme_minimal() +
  ggtitle("Genes colored by luminal H3K4me1 signal") +
  xlab("UMAP Dimension 1") +
  ylab("UMAP Dimension 2") +
  labs(color = "luminal H3K4me1") +
  theme(legend.position = "right") +
  theme_classic() + 
  scale_color_gradientn(colors = c("#D3D3D3", "#D3D3D3", "#A3C5D0", "#4DBBD5E5", "#006D7F", "#00201b"),
                        trans = log1p_trans,
                        breaks = c(0, 10, 20, 50, 100, 200, 400, 600))


p7 <- ggplot(umap_embeddings, aes(x = UMAP_1, y = UMAP_2, color = log2FC_k27_luminal_vs_basal, label = gene)) +
  geom_point(size = 1) +
  theme_minimal() +
  ggtitle("Genes colored by H3K27me3 signal change in luminal vs basal") +
  xlab("UMAP Dimension 1") +
  ylab("UMAP Dimension 2") +
  labs(color = "log2FC H3K27me3 signal") +
  theme(legend.position = "right") +
  theme_classic() + 
  scale_color_gradient2(
    low = mypal[4], 
    mid = "lightgrey", 
    high = mypal[1], 
    midpoint = 0,
    limits = c(-4, 4),  # Full legend range
    oob = scales::squish,  # Caps the colors beyond the limits
    breaks = c(-4, -2, 0, 2, 4),  # Legend breaks
    labels = c( "<-4", "-2", "0", "2", ">4")  # Legend labels
  )

p8 <- ggplot(umap_embeddings, aes(x = UMAP_1, y = UMAP_2, color = log2FC_k4_luminal_vs_basal, label = gene)) +
  geom_point(size = 1) +
  theme_minimal() +
  ggtitle("Genes colored by H3K4me1 signal change in luminal vs basal") +
  xlab("UMAP Dimension 1") +
  ylab("UMAP Dimension 2") +
  labs(color = "log2FC H3K4me1 signal") +
  theme(legend.position = "right") +
  theme_classic() + 
  scale_color_gradient2(
    low = mypal[4], 
    mid = "lightgrey", 
    high = mypal[1], 
    midpoint = 0,
    limits = c(-4, 4),  # Full legend range
    oob = scales::squish,  # Caps the colors beyond the limits
    breaks = c(-4, -2, 0, 2, 4),  # Legend breaks
    labels = c( "<-4", "-2", "0", "2", ">4")  # Legend labels
  )

umap_embeddings <- merge(umap_embeddings, da_activity_k27[, c("gene", "pval_k27", "padj_k27")], by = "gene", all.x = TRUE)
umap_embeddings <- merge(umap_embeddings, da_activity_k4[, c("gene", "pval_k4", "padj_k4")], by = "gene", all.x = TRUE)


p9 <- ggplot(umap_embeddings, aes(x = UMAP_1, y = UMAP_2, color = pval_k27, label = gene)) +
  geom_point(size = 1) +
  theme_minimal() +
  ggtitle("Genes colored by pval of H3K27me3 signal change in luminal vs basal") +
  xlab("UMAP Dimension 1") +
  ylab("UMAP Dimension 2") +
  labs(color = "pval") +
  theme(legend.position = "right") +
  theme_classic() + 
  scale_color_gradient(low = "red", high = "blue")


p10 <- ggplot(umap_embeddings, aes(x = UMAP_1, y = UMAP_2, color = pval_k4, label = gene)) +
  geom_point(size = 1) +
  theme_minimal() +
  ggtitle("Genes colored by padj of H3K4me1 signal change in luminal vs basal") +
  xlab("UMAP Dimension 1") +
  ylab("UMAP Dimension 2") +
  labs(color = "pval") +
  theme(legend.position = "right") +
  theme_classic() + 
  scale_color_gradient(low = "red", high = "blue")

p1

p2

p7

p8

7.5 UMAP plots (separate by mark)

umap_gene_activity_lf_pval0.05_k27 <- readRDS(file.path(outputDir_objects, "umap_gene_activity_filt10_lf_pval00.5_k27.rds"))
umap_gene_activity_lf_pval0.05_k4 <- readRDS(file.path(outputDir_objects, "umap_gene_activity_filt10_lf_pval00.5_k4.rds"))

log1p_trans <- trans_new("log1p", transform = function(x) log(x + 1), inverse = function(x) exp(x) - 1)

umap_embeddings_k27 <- as.data.frame(umap_gene_activity_lf_pval0.05_k27$layout)
colnames(umap_embeddings_k27) <- c("UMAP_1", "UMAP_2")
umap_embeddings_k27$gene <- df_k27$gene
umap_embeddings_k27 <- left_join(umap_embeddings_k27, df_k27, by = "gene")
umap_embeddings_k27$signal_k27 <- umap_embeddings_k27$basal_signal_k27 + umap_embeddings_k27$luminal_signal_k27
umap_embeddings_k27$signal_k4 <- umap_embeddings_k27$basal_signal_k4 + umap_embeddings_k27$luminal_signal_k4
umap_embeddings_k27$basal_signal_k4_norm <- umap_embeddings_k27$basal_signal_k4 / norm_factor
umap_embeddings_k27 <- merge(umap_embeddings_k27, da_activity_k27[, c("gene", "pval_k27", "padj_k27")], by = "gene", all.x = TRUE)
umap_embeddings_k27 <- merge(umap_embeddings_k27, da_activity_k4[, c("gene", "pval_k4", "padj_k4")], by = "gene", all.x = TRUE)
umap_embeddings_k27$pct_basal_k27 <- df_k27$pct_basal_k27
umap_embeddings_k27$pct_luminal_k27 <- df_k27$pct_luminal_k27
umap_embeddings_k27$entropy_k27 <- df_k27$entropy_k27
umap_embeddings_k27$entropy_k27_norm <- df_k27$entropy_k27_norm
umap_embeddings_k27$entropy_k4 <- df_k27$entropy_k4
umap_embeddings_k27$entropy_k4_norm <- df_k27$entropy_k4_norm
umap_embeddings_k27$entropy_k27_luminal <- df_k27$entropy_k27_luminal
umap_embeddings_k27$entropy_k27_basal <- df_k27$entropy_k27_basal
umap_embeddings_k27$entropy_k27_norm_binned <- df_k27$entropy_k27_norm_binned
umap_embeddings_k27$entropy_k4_norm_binned <- df_k27$entropy_k4_norm_binned

umap_embeddings_k27$entropy_k27_norm_binned <- df_k27$entropy_k27_basal_binned
umap_embeddings_k27$entropy_k4_norm_binned <- df_k27$entropy_k4_luminal_binned

umap_embeddings_k4 <- as.data.frame(umap_gene_activity_lf_pval0.05_k4$layout)
colnames(umap_embeddings_k4) <- c("UMAP_1", "UMAP_2")
umap_embeddings_k4$gene <- df_k4$gene
umap_embeddings_k4 <- left_join(umap_embeddings_k4, df_k4, by = "gene")
umap_embeddings_k4$signal_k27 <- umap_embeddings_k4$basal_signal_k27 + umap_embeddings_k4$luminal_signal_k27
umap_embeddings_k4$signal_k4 <- umap_embeddings_k4$basal_signal_k4 + umap_embeddings_k4$luminal_signal_k4
umap_embeddings_k4$basal_signal_k27_norm <- umap_embeddings_k4$basal_signal_k27 / norm_factor
umap_embeddings_k4 <- merge(umap_embeddings_k4, da_activity_k27[, c("gene", "pval_k27", "padj_k27")], by = "gene", all.x = TRUE)
umap_embeddings_k4 <- merge(umap_embeddings_k4, da_activity_k4[, c("gene", "pval_k4", "padj_k4")], by = "gene", all.x = TRUE)
umap_embeddings_k4$pct_basal_k4 <- df_k4$pct_basal_k4
umap_embeddings_k4$pct_luminal_k4 <- df_k4$pct_luminal_k4

umap_embeddings_k4$entropy_k27 <- df_k4$entropy_k27
umap_embeddings_k4$entropy_k27_norm <- df_k4$entropy_k27_norm
umap_embeddings_k4$entropy_k4 <- df_k4$entropy_k4
umap_embeddings_k4$entropy_k4_norm <- df_k4$entropy_k4_norm

7.5.1 H3K27me3

7.5.1.1 Zooming
### first look

ggplot(umap_embeddings_k27, aes(x = UMAP_1, y = UMAP_2, color = signal_k27, label = gene)) +
  geom_point(size = 1) +
  theme_minimal() +
  ggtitle("Genes colored by total H3K27me3 signal") +
  xlab("UMAP Dimension 1") +
  ylab("UMAP Dimension 2") +
  labs(color = "total H3K27me3") +
  theme(legend.position = "right")  +
  theme_classic() + 
  scale_color_gradientn(colors = c("#D3D3D3", "#D3D3D3", "#BECFCC", "#00A087E5", "#005043", "#00201b"),
                        trans = log1p_trans,
                        breaks = c(0, 10, 20, 50, 100, 200, 400, 600))

### exclude the outlier dots with low coverage

umap_embeddings_k27 <- umap_embeddings_k27 %>%
  dplyr::filter(UMAP_1 >= -15 & UMAP_1 <= 5, UMAP_2 >= -10 & UMAP_2 <= 10)
7.5.1.2 Epigenomic signal
umap_k27_by_total_k27 <- ggplot(umap_embeddings_k27, aes(x = UMAP_1, y = UMAP_2, color = signal_k27, label = gene)) +
  geom_point(size = 1) +
  theme_minimal() +
  ggtitle("Genes colored by total H3K27me3 signal") +
  xlab("UMAP Dimension 1") +
  ylab("UMAP Dimension 2") +
  labs(color = "total H3K27me3") +
  theme(legend.position = "right")  +
  theme_classic() + 
  scale_color_gradientn(colors = c("#D3D3D3", "#D3D3D3", "#BECFCC", "#00A087E5", "#005043", "#00201b"),
                        trans = log1p_trans,
                        breaks = c(0, 10, 20, 50, 100, 200, 400, 600))


umap_k27_by_total_k4 <- ggplot(umap_embeddings_k27, aes(x = UMAP_1, y = UMAP_2, color = signal_k4, label = gene)) +
  geom_point(size = 1) +
  theme_minimal() +
  ggtitle("Genes colored by total H3K4me1 signal") +
  xlab("UMAP Dimension 1") +
  ylab("UMAP Dimension 2") +
  labs(color = "total H3K4me1") +
  theme(legend.position = "right")  +
  theme_classic() + 
  scale_color_gradientn(colors = c("#D3D3D3", "#D3D3D3", "#A3C5D0", "#4DBBD5E5", "#006D7F", "#00201b"),
                        trans = log1p_trans,
                        breaks = c(0, 10, 20, 50, 100, 200, 400, 600))


umap_k27_by_basal_k27 <- ggplot(umap_embeddings_k27, aes(x = UMAP_1, y = UMAP_2, color = basal_signal_k27_norm, label = gene)) +
  geom_point(size = 1) +
  theme_minimal() +
  ggtitle("Genes colored by basal H3K27me3 signal") +
  xlab("UMAP Dimension 1") +
  ylab("UMAP Dimension 2") +
  labs(color = "basal H3K27me3") +
  theme(legend.position = "right") +
  theme_classic() +
  theme_classic() + 
  scale_color_gradientn(colors = c("#D3D3D3", "#D3D3D3", "#BECFCC", "#00A087E5", "#005043", "#00201b"),
                        trans = log1p_trans,
                        breaks = c(0, 10, 20, 50, 100, 200, 400, 600))

umap_k27_by_luminal_k27 <- ggplot(umap_embeddings_k27, aes(x = UMAP_1, y = UMAP_2, color = luminal_signal_k27, label = gene)) +
  geom_point(size = 1) +
  theme_minimal() +
  ggtitle("Genes colored by luminal H3K27me3 signal") +
  xlab("UMAP Dimension 1") +
  ylab("UMAP Dimension 2") +
  labs(color = "luminal H3K27me3") +
  theme(legend.position = "right") +
  theme_classic() + 
  scale_color_gradientn(colors = c("#D3D3D3", "#D3D3D3", "#BECFCC", "#00A087E5", "#005043", "#00201b"),
                        trans = log1p_trans,
                        breaks = c(0, 10, 20, 50, 100, 200, 400, 600))

umap_k27_by_basal_k4 <- ggplot(umap_embeddings_k27, aes(x = UMAP_1, y = UMAP_2, color = basal_signal_k4_norm, label = gene)) +
  geom_point(size = 1) +
  theme_minimal() +
  ggtitle("Genes colored by basal H3K4me1 signal") +
  xlab("UMAP Dimension 1") +
  ylab("UMAP Dimension 2") +
  labs(color = "basal H3K4me1") +
  theme(legend.position = "right") +
  theme_classic() + 
  scale_color_gradientn(colors = c("#D3D3D3", "#D3D3D3", "#A3C5D0", "#4DBBD5E5", "#006D7F", "#00201b"),
                        trans = log1p_trans,
                        breaks = c(0, 10, 20, 50, 100, 200, 400, 600))

umap_k27_by_luminal_k4 <- ggplot(umap_embeddings_k27, aes(x = UMAP_1, y = UMAP_2, color = luminal_signal_k4, label = gene)) +
  geom_point(size = 1) +
  theme_minimal() +
  ggtitle("Genes colored by luminal H3K4me1 signal") +
  xlab("UMAP Dimension 1") +
  ylab("UMAP Dimension 2") +
  labs(color = "luminal H3K4me1") +
  theme(legend.position = "right") +
  theme_classic() + 
  scale_color_gradientn(colors = c("#D3D3D3", "#D3D3D3", "#A3C5D0", "#4DBBD5E5", "#006D7F", "#00201b"),
                        trans = log1p_trans,
                        breaks = c(0, 10, 20, 50, 100, 200, 400, 600))


umap_k27_by_pct_basal <- ggplot(umap_embeddings_k27, aes(x = UMAP_1, y = UMAP_2, color = pct_basal_k27, label = gene)) +
  geom_point(size = 1) +
  theme_minimal() +
  ggtitle("Genes colored by % of basal cells with signal in the gene") +
  xlab("UMAP Dimension 1") +
  ylab("UMAP Dimension 2") +
  labs(color = "% of basal cells") +
  theme(legend.position = "right") +
  theme_classic() +
  scale_color_gradientn(colors = c("#D3D3D3", "#89cac0", "#00A087E5", "#13695b", "#00483c", "#00201b", "#001612"),
                        breaks = c(0, 25, 50, 75, 100),
                        limits = c(0, 100))

umap_k27_by_pct_luminal <- ggplot(umap_embeddings_k27, aes(x = UMAP_1, y = UMAP_2, color = pct_luminal_k27, label = gene)) +
  geom_point(size = 1) +
  theme_minimal() +
  ggtitle("Genes colored by % of luminal cells with signal in the gene") +
  xlab("UMAP Dimension 1") +
  ylab("UMAP Dimension 2") +
  labs(color = "% of luminal cells") +
  theme(legend.position = "right") +
  theme_classic() +
  scale_color_gradientn(colors = c("#D3D3D3", "#89cac0", "#00A087E5", "#13695b", "#00483c", "#00201b", "#001612"),
                        breaks = c(0, 25, 50, 75, 100),
                        limits = c(0, 100))

##### need to plot colored over gray
gray_points <- umap_embeddings_k27
colored_points <- umap_embeddings_k27[umap_embeddings_k27$pct_basal_k4 != 0, ]

umap_k27_by_pct_basal_k4 <- ggplot() +
  geom_point(data = gray_points, 
             aes(x = UMAP_1, y = UMAP_2), 
             color = "lightgrey", size = 1) +
  geom_point(data = colored_points, 
             aes(x = UMAP_1, y = UMAP_2, 
                 color = pct_basal_k4), 
             size = 1) +
  ggtitle("Genes colored by % of basal cells with signal in the gene") +
  xlab("UMAP Dimension 1") +
  ylab("UMAP Dimension 2") +
  labs(color = "% of basal cells") +
  theme_classic() +
  theme(legend.position = "right") +
  scale_color_gradientn(colors = c("#D3D3D3", "#A3C5D0", "#4DBBD5E5", "#006D7F", "#104d57", "#00201b"))


gray_points <- umap_embeddings_k27
colored_points <- umap_embeddings_k27[umap_embeddings_k27$pct_luminal_k4 != 0, ]

umap_k27_by_pct_luminal_k4 <- ggplot() +
  geom_point(data = gray_points, 
             aes(x = UMAP_1, y = UMAP_2), 
             color = "lightgrey", size = 1) +
  geom_point(data = colored_points, 
             aes(x = UMAP_1, y = UMAP_2, 
                 color = pct_luminal_k4), 
             size = 1) +
  ggtitle("Genes colored by % of luminal cells with signal in the gene") +
  xlab("UMAP Dimension 1") +
  ylab("UMAP Dimension 2") +
  labs(color = "% of luminal cells") +
  theme_classic() +
  theme(legend.position = "right") +
  scale_color_gradientn(colors = c("#D3D3D3", "#A3C5D0", "#4DBBD5E5", "#006D7F", "#104d57", "#00201b"))

  
#####

umap_k27_by_pct_all <- ggplot(umap_embeddings_k27, aes(x = UMAP_1, y = UMAP_2, color = pct_basal_k27 + pct_luminal_k27, label = gene)) +
  geom_point(size = 1) +
  theme_minimal() +
  ggtitle("Genes colored by % of basal cells with signal in the gene") +
  xlab("UMAP Dimension 1") +
  ylab("UMAP Dimension 2") +
  labs(color = "% of basal cells") +
  theme(legend.position = "right") +
  theme_classic() +
  theme_classic() + 
  scale_color_gradientn(colors = c("#D3D3D3", "#BECFCC", "#00A087E5", "#005043", "#00201b"))

umap_k27_by_pct_basal

umap_k27_by_pct_luminal

ggsave(file.path(outputDir_plots, "umap_k27_by_pct_basal_green.pdf"),
       plot = umap_k27_by_pct_basal,
       device = "pdf",
       units = "mm",
       width = 130,
       height = 100)

ggsave(file.path(outputDir_plots, "umap_k27_by_pct_luminal_green.pdf"),
       plot = umap_k27_by_pct_luminal,
       device = "pdf",
       units = "mm",
       width = 133,
       height = 100)

ggsave(file.path(outputDir_plots, "umap_k27_by_pct_basal_k4_blue.pdf"),
       plot = umap_k27_by_pct_basal_k4,
       device = "pdf",
       units = "mm",
       width = 130,
       height = 100)
ggsave(file.path(outputDir_plots, "umap_k27_by_pct_liminal_k4_blue.pdf"),
       plot = umap_k27_by_pct_luminal_k4,
       device = "pdf",
       units = "mm",
       width = 133,
       height = 100)

ggsave(file.path(outputDir_plots, "umap_k27_by_total_k27.pdf"),
       plot = umap_k27_by_total_k27,
       device = "pdf",
       units = "mm",
       width = 130,
       height = 100)

ggsave(file.path(outputDir_plots, "umap_k27_by_total_k4.pdf"),
       plot = umap_k27_by_total_k4,
       device = "pdf",
       units = "mm",
       width = 130,
       height = 100)

ggsave(file.path(outputDir_plots, "umap_k27_by_basal_k27.pdf"),
       plot = umap_k27_by_basal_k27,
       device = "pdf",
       units = "mm",
       width = 130,
       height = 100)

ggsave(file.path(outputDir_plots, "umap_k27_by_luminal_k27.pdf"),
       plot = umap_k27_by_luminal_k27,
       device = "pdf",
       units = "mm",
       width = 133,
       height = 100)

ggsave(file.path(outputDir_plots, "umap_k27_by_basal_k4.pdf"),
       plot = umap_k27_by_basal_k4,
       device = "pdf",
       units = "mm",
       width = 130,
       height = 100)

ggsave(file.path(outputDir_plots, "umap_k27_by_luminal_k4.pdf"),
       plot = umap_k27_by_luminal_k4,
       device = "pdf",
       units = "mm",
       width = 133,
       height = 100)
7.5.1.3 Fold change and p-values
### LF 
## plotting non-zero points over the zero ones for better visualisation
gray_points <- umap_embeddings_k27
colored_points <- umap_embeddings_k27[umap_embeddings_k27$log2FC_k27_luminal_vs_basal != 0, ]

umap_k27_by_k27_fc <- ggplot() +
  geom_point(data = gray_points, 
             aes(x = UMAP_1, y = UMAP_2), 
             color = "lightgrey", size = 1) +
  geom_point(data = colored_points, 
             aes(x = UMAP_1, y = UMAP_2, 
                 color = log2FC_k27_luminal_vs_basal), 
             size = 1) +
  ggtitle("Genes colored by H3K27me3 signal change in luminal vs basal") +
  xlab("UMAP Dimension 1") +
  ylab("UMAP Dimension 2") +
  labs(color = "log2FC H3K27me3 signal") +
  theme_classic() +
  theme(legend.position = "right") +
  scale_color_gradient2(
    low = mypal[4], 
    mid = "lightgrey", 
    high = mypal[1], 
    midpoint = 0,
    limits = c(-4, 4),  
    oob = scales::squish, 
    breaks = c(-4, -2, 0, 2, 4),  
    labels = c("<-4", "-2", "0", "2", ">4")  
  )


gray_points <- umap_embeddings_k27
colored_points <- umap_embeddings_k27[umap_embeddings_k27$log2FC_k4_luminal_vs_basal != 0, ]

umap_k27_by_k4_fc <-ggplot() +
  geom_point(data = gray_points, 
             aes(x = UMAP_1, y = UMAP_2), 
             color = "lightgrey", size = 1) +
  geom_point(data = colored_points, 
             aes(x = UMAP_1, y = UMAP_2, 
                 color = log2FC_k4_luminal_vs_basal), 
             size = 1) +
  ggtitle("Genes colored by H3K4me1 signal change in luminal vs basal") +
  xlab("UMAP Dimension 1") +
  ylab("UMAP Dimension 2") +
  labs(color = "log2FC H3K4me1 signal") +
  theme_classic() +
  theme(legend.position = "right") +
  scale_color_gradient2(
    low = mypal[4], 
    mid = "lightgrey", 
    high = mypal[1], 
    midpoint = 0,
    limits = c(-4, 4),  
    oob = scales::squish, 
    breaks = c(-4, -2, 0, 2, 4),  
    labels = c("<-4", "-2", "0", "2", ">4")  
  )

### p-values
umap_k27_pval_k27 <- ggplot(umap_embeddings_k27, aes(x = UMAP_1, y = UMAP_2, color = pval_k27, label = gene)) +
  geom_point(size = 1) +
  theme_minimal() +
  ggtitle("Genes colored by pval of H3K27me3 signal change in luminal vs basal") +
  xlab("UMAP Dimension 1") +
  ylab("UMAP Dimension 2") +
  labs(color = "pval") +
  theme(legend.position = "right") +
  theme_classic() + 
  scale_color_gradient(low = "red", high = "blue")

umap_k27_pval_k4 <- ggplot(umap_embeddings_k27, aes(x = UMAP_1, y = UMAP_2, color = pval_k4, label = gene)) +
  geom_point(size = 1) +
  theme_minimal() +
  ggtitle("Genes colored by pval of H3K4me1 signal change in luminal vs basal") +
  xlab("UMAP Dimension 1") +
  ylab("UMAP Dimension 2") +
  labs(color = "pval") +
  theme(legend.position = "right") +
  theme_classic() + 
  scale_color_gradient(low = "red", high = "blue")

umap_k27_padj_k27 <- ggplot(umap_embeddings_k27, aes(x = UMAP_1, y = UMAP_2, color = padj_k27, label = gene)) +
  geom_point(size = 1) +
  theme_minimal() +
  ggtitle("Genes colored by padj of H3K27me3 signal change in luminal vs basal") +
  xlab("UMAP Dimension 1") +
  ylab("UMAP Dimension 2") +
  labs(color = "padj") +
  theme(legend.position = "right") +
  theme_classic() + 
  scale_color_gradient(low = "red", high = "blue")

umap_k4_padj_k4 <- ggplot(umap_embeddings_k27, aes(x = UMAP_1, y = UMAP_2, color = padj_k4, label = gene)) +
  geom_point(size = 1) +
  theme_minimal() +
  ggtitle("Genes colored by padj of H3K4me1 signal change in luminal vs basal") +
  xlab("UMAP Dimension 1") +
  ylab("UMAP Dimension 2") +
  labs(color = "padj") +
  theme(legend.position = "right") +
  theme_classic() + 
  scale_color_gradient(low = "red", high = "blue")


umap_k27_by_k27_fc

ggsave(file.path(outputDir_plots, "umap_k27_by_k4_fc.pdf"),
       plot = umap_k27_by_k4_fc,
       device = "pdf",
       units = "mm",
       width = 145,
       height = 100)

ggsave(file.path(outputDir_plots, "umap_k27_by_k27_fc.pdf"),
       plot = umap_k27_by_k27_fc,
       device = "pdf",
       units = "mm",
       width = 145,
       height = 100)
7.5.1.4 Entropy
 umap_k27_by_k27_entropy <- ggplot(umap_embeddings_k27, aes(x = UMAP_1, y = UMAP_2, color = entropy_k27, label = gene)) +
  geom_point(size = 1) +
  theme_minimal() +
  ggtitle("Genes colored by intercellular entropy of H3K27me3 signal") +
  xlab("UMAP Dimension 1") +
  ylab("UMAP Dimension 2") +
  labs(color = "Shannon's entropy") +
  theme(legend.position = "right")  +
  theme_classic() + 
  scale_color_gradientn(colors = c("#ede6ed", "#e6dae6", "#dbbfdb", "#c491c4", "#7A527A", "#4C334C", "#261b26"))

umap_k27_by_k4_entropy <- ggplot(umap_embeddings_k27, aes(x = UMAP_1, y = UMAP_2, color = entropy_k4, label = gene)) +
  geom_point(size = 1) +
  theme_minimal() +
  ggtitle("Genes colored by intercellular entropy of H3K4me1 signal") +
  xlab("UMAP Dimension 1") +
  ylab("UMAP Dimension 2") +
  labs(color = "Shannon's entropy") +
  theme(legend.position = "right")  +
  theme_classic() + 
  scale_color_gradientn(colors = c("#ede6ed", "#e6dae6", "#dbbfdb", "#c491c4", "#7A527A", "#4C334C", "#261b26"))


umap_k27_by_k27_entropy_luminal <- ggplot(umap_embeddings_k27, aes(x = UMAP_1, y = UMAP_2, color = entropy_k27_luminal, label = gene)) +
  geom_point(size = 1) +
  theme_minimal() +
  ggtitle("Genes colored by intercellular entropy of H3K27me3 signal, luminal") +
  xlab("UMAP Dimension 1") +
  ylab("UMAP Dimension 2") +
  labs(color = "Shannon's entropy") +
  theme(legend.position = "right")  +
  theme_classic() + 
  scale_color_gradientn(colors = c("#ede6ed", "#e6dae6", "#dbbfdb", "#c491c4", "#7A527A", "#4C334C", "#261b26"))


umap_k27_by_k27_entropy_basal <- ggplot(umap_embeddings_k27, aes(x = UMAP_1, y = UMAP_2, color = entropy_k27_basal, label = gene)) +
  geom_point(size = 1) +
  theme_minimal() +
  ggtitle("Genes colored by intercellular entropy of H3K27me3 signal, basal") +
  xlab("UMAP Dimension 1") +
  ylab("UMAP Dimension 2") +
  labs(color = "Shannon's entropy") +
  theme(legend.position = "right")  +
  theme_classic() + 
  scale_color_gradientn(colors = c("#ede6ed", "#e6dae6", "#dbbfdb", "#c491c4", "#7A527A", "#4C334C", "#261b26"))



umap_k27_by_k27_entropy

umap_k27_by_k4_entropy

ggsave(file.path(outputDir_plots, "umap_k27_by_k4_entropy.pdf"),
       plot = umap_k27_by_k4_entropy,
       device = "pdf",
       units = "mm",
       width = 134,
       height = 100)

ggsave(file.path(outputDir_plots, "umap_k27_by_k27_entropy.pdf"),
       plot = umap_k27_by_k27_entropy,
       device = "pdf",
       units = "mm",
       width = 134,
       height = 100)

7.5.2 H3K4me1

7.5.2.1 Zooming
ggplot(umap_embeddings_k4, aes(x = UMAP_1, y = UMAP_2, color = signal_k4, label = gene)) +
  geom_point(size = 1) +
  theme_minimal() +
  ggtitle("Genes colored by total H3K4me1 signal") +
  xlab("UMAP Dimension 1") +
  ylab("UMAP Dimension 2") +
  labs(color = "total H3K4me1") +
  theme(legend.position = "right")  +
  theme_classic() + 
  scale_color_gradientn(colors = c("#D3D3D3", "#D3D3D3", "#A3C5D0", "#4DBBD5E5", "#006D7F", "#00201b"),
                        trans = log1p_trans,
                        breaks = c(0, 10, 20, 50, 100, 200, 400, 600))

### exclude the outlier dots with low coverage

umap_embeddings_k4 <- umap_embeddings_k4 %>%
  dplyr::filter(UMAP_1 >= -14 & UMAP_1 <= 5, UMAP_2 >= -5 & UMAP_2 <= 7)
7.5.2.2 Epigenomic signal
umap_k4_by_total_k4 <-ggplot(umap_embeddings_k4, aes(x = UMAP_1, y = UMAP_2, color = signal_k4, label = gene)) +
  geom_point(size = 1) +
  theme_minimal() +
  ggtitle("Genes colored by total H3K4me1 signal") +
  xlab("UMAP Dimension 1") +
  ylab("UMAP Dimension 2") +
  labs(color = "total H3K4me1") +
  theme(legend.position = "right")  +
  theme_classic() + 
  scale_color_gradientn(colors = c("#D3D3D3", "#D3D3D3", "#A3C5D0", "#4DBBD5E5", "#006D7F", "#00201b"),
                        trans = log1p_trans,
                        breaks = c(0, 10, 20, 50, 100, 200, 400, 600))

umap_k4_by_total_k27 <- ggplot(umap_embeddings_k4, aes(x = UMAP_1, y = UMAP_2, color = signal_k27, label = gene)) +
  geom_point(size = 1) +
  theme_minimal() +
  ggtitle("Genes colored by total H3K27me3 signal") +
  xlab("UMAP Dimension 1") +
  ylab("UMAP Dimension 2") +
  labs(color = "total H3K27me3") +
  theme(legend.position = "right")  +
  theme_classic() + 
  scale_color_gradientn(colors = c("#D3D3D3", "#D3D3D3", "#BECFCC", "#00A087E5", "#005043", "#00201b"),
                        trans = log1p_trans,
                        breaks = c(0, 10, 20, 50, 100, 200, 400, 600))

umap_k4_by_basal_k4 <- ggplot(umap_embeddings_k4, aes(x = UMAP_1, y = UMAP_2, color = basal_signal_k4_norm, label = gene)) +
  geom_point(size = 1) +
  theme_minimal() +
  ggtitle("Genes colored by basal H3K4me1 signal") +
  xlab("UMAP Dimension 1") +
  ylab("UMAP Dimension 2") +
  labs(color = "basal H3K4me1") +
  theme(legend.position = "right") +
  theme_classic() + 
  scale_color_gradientn(colors = c("#D3D3D3", "#D3D3D3", "#A3C5D0", "#4DBBD5E5", "#006D7F", "#00201b"),
                        trans = log1p_trans,
                        breaks = c(0, 10, 20, 50, 100, 200, 400, 600))

umap_k4_by_luminal_k4 <- ggplot(umap_embeddings_k4, aes(x = UMAP_1, y = UMAP_2, color = luminal_signal_k4, label = gene)) +
  geom_point(size = 1) +
  theme_minimal() +
  ggtitle("Genes colored by luminal H3K4me1 signal") +
  xlab("UMAP Dimension 1") +
  ylab("UMAP Dimension 2") +
  labs(color = "luminal H3K4me1") +
  theme(legend.position = "right") +
  theme_classic() + 
  scale_color_gradientn(colors = c("#D3D3D3", "#D3D3D3", "#A3C5D0", "#4DBBD5E5", "#006D7F", "#00201b"),
                        trans = log1p_trans,
                        breaks = c(0, 10, 20, 50, 100, 200, 400, 600))


umap_k4_by_basal_k27 <- ggplot(umap_embeddings_k4, aes(x = UMAP_1, y = UMAP_2, color = basal_signal_k27_norm, label = gene)) +
  geom_point(size = 1) +
  theme_minimal() +
  ggtitle("Genes colored by basal H3K27me3 signal") +
  xlab("UMAP Dimension 1") +
  ylab("UMAP Dimension 2") +
  labs(color = "basal H3K27me3") +
  theme(legend.position = "right") +
  theme_classic() +
  theme_classic() + 
  scale_color_gradientn(colors = c("#D3D3D3", "#D3D3D3", "#BECFCC", "#00A087E5", "#005043", "#00201b"),
                        trans = log1p_trans,
                        breaks = c(0, 10, 20, 50, 100, 200, 400, 600))

umap_k4_by_luminal_k27 <- ggplot(umap_embeddings_k4, aes(x = UMAP_1, y = UMAP_2, color = luminal_signal_k27, label = gene)) +
  geom_point(size = 1) +
  theme_minimal() +
  ggtitle("Genes colored by luminal H3K27me3 signal") +
  xlab("UMAP Dimension 1") +
  ylab("UMAP Dimension 2") +
  labs(color = "luminal H3K27me3") +
  theme(legend.position = "right") +
  theme_classic() + 
  scale_color_gradientn(colors = c("#D3D3D3", "#D3D3D3", "#BECFCC", "#00A087E5", "#005043", "#00201b"),
                        trans = log1p_trans,
                        breaks = c(0, 10, 20, 50, 100, 200, 400, 600))


umap_k4_by_pct_basal <- ggplot(umap_embeddings_k4, aes(x = UMAP_1, y = UMAP_2, color = pct_basal_k4, label = gene)) +
  geom_point(size = 1) +
  theme_minimal() +
  ggtitle("Genes colored by % of basal cells with signal in the gene") +
  xlab("UMAP Dimension 1") +
  ylab("UMAP Dimension 2") +
  labs(color = "% of basal cells") +
  theme(legend.position = "right") +
  theme_classic() +
  scale_color_gradientn(colors = c("#D3D3D3", "#A3C5D0", "#4DBBD5E5", "#006D7F", "#104d57", "#00201b", "#001612"),
                        breaks = c(0, 25, 50, 75, 100),
                        limits = c(0, 100))

umap_k4_by_pct_luminal <- ggplot(umap_embeddings_k4, aes(x = UMAP_1, y = UMAP_2, color = pct_luminal_k4, label = gene)) +
  geom_point(size = 1) +
  theme_minimal() +
  ggtitle("Genes colored by % of luminal cells with signal in the gene") +
  xlab("UMAP Dimension 1") +
  ylab("UMAP Dimension 2") +
  labs(color = "% of luminal cells") +
  theme(legend.position = "right") +
  theme_classic() +
  scale_color_gradientn(colors = c("#D3D3D3", "#A3C5D0", "#4DBBD5E5", "#006D7F", "#104d57", "#00201b", "#001612"),
                        breaks = c(0, 25, 50, 75, 100),
                        limits = c(0, 100))


gray_points <- umap_embeddings_k4
colored_points <- umap_embeddings_k4[umap_embeddings_k4$pct_basal_k27 != 0, ]

umap_k4_by_pct_basal_k27 <- ggplot() +
  geom_point(data = gray_points, 
             aes(x = UMAP_1, y = UMAP_2), 
             color = "lightgrey", size = 1) +
  geom_point(data = colored_points, 
             aes(x = UMAP_1, y = UMAP_2, 
                 color = pct_basal_k4), 
             size = 1) +
  ggtitle("Genes colored by % of basal cells with signal in the gene") +
  xlab("UMAP Dimension 1") +
  ylab("UMAP Dimension 2") +
  labs(color = "% of basal cells") +
  theme_classic() +
  theme(legend.position = "right") +
  scale_color_gradientn(colors = c("#D3D3D3", "#89cac0", "#00A087E5", "#13695b", "#00483c", "#00201b"))

gray_points <- umap_embeddings_k4
colored_points <- umap_embeddings_k4[umap_embeddings_k4$pct_luminal_k27 != 0, ]

umap_k4_by_pct_luminal_k27 <- ggplot() +
  geom_point(data = gray_points, 
             aes(x = UMAP_1, y = UMAP_2), 
             color = "lightgrey", size = 1) +
  geom_point(data = colored_points, 
             aes(x = UMAP_1, y = UMAP_2, 
                 color = pct_luminal_k4), 
             size = 1) +
  ggtitle("Genes colored by % of luminal cells with signal in the gene") +
  xlab("UMAP Dimension 1") +
  ylab("UMAP Dimension 2") +
  labs(color = "% of luminal cells") +
  theme_classic() +
  theme(legend.position = "right") +
  scale_color_gradientn(colors = c("#D3D3D3", "#89cac0", "#00A087E5", "#13695b", "#00483c", "#00201b"))


umap_k4_by_pct_basal

umap_k4_by_pct_luminal

ggsave(file.path(outputDir_plots, "umap_k4_by_pct_basal_k27_green.pdf"),
       plot = umap_k4_by_pct_basal_k27,
       device = "pdf",
       units = "mm",
       width = 130,
       height = 100)
ggsave(file.path(outputDir_plots, "umap_k4_by_pct_liminal_k27_green.pdf"),
       plot = umap_k4_by_pct_luminal_k27,
       device = "pdf",
       units = "mm",
       width = 133,
       height = 100)

ggsave(file.path(outputDir_plots, "umap_k4_by_pct_basal_blue.pdf"),
       plot = umap_k4_by_pct_basal,
       device = "pdf",
       units = "mm",
       width = 130,
       height = 100)
ggsave(file.path(outputDir_plots, "umap_k4_by_pct_liminal_blue.pdf"),
       plot = umap_k4_by_pct_luminal,
       device = "pdf",
       units = "mm",
       width = 133,
       height = 100)

ggsave(file.path(outputDir_plots, "umap_k4_by_total_k27.pdf"),
       plot = umap_k4_by_total_k27,
       device = "pdf",
       units = "mm",
       width = 130,
       height = 100)

ggsave(file.path(outputDir_plots, "umap_k4_by_total_k4.pdf"),
       plot = umap_k4_by_total_k4,
       device = "pdf",
       units = "mm",
       width = 130,
       height = 100)

ggsave(file.path(outputDir_plots, "umap_k4_by_basal_k27.pdf"),
       plot = umap_k4_by_basal_k27,
       device = "pdf",
       units = "mm",
       width = 130,
       height = 100)

ggsave(file.path(outputDir_plots, "umap_k4_by_luminal_k27.pdf"),
       plot = umap_k4_by_luminal_k27,
       device = "pdf",
       units = "mm",
       width = 133,
       height = 100)

ggsave(file.path(outputDir_plots, "umap_k4_by_basal_k4.pdf"),
       plot = umap_k4_by_basal_k4,
       device = "pdf",
       units = "mm",
       width = 130,
       height = 100)

ggsave(file.path(outputDir_plots, "umap_k4_by_luminal_k4.pdf"),
       plot = umap_k4_by_luminal_k4,
       device = "pdf",
       units = "mm",
       width = 133,
       height = 100)
7.5.2.3 Fold change and p-values
### LF 
## plotting non-zero points over the zero ones for better visualisation
gray_points <- umap_embeddings_k4
colored_points <- umap_embeddings_k4[umap_embeddings_k4$log2FC_k4_luminal_vs_basal != 0, ]

umap_k4_by_k4_fc <- ggplot() +
  geom_point(data = gray_points, 
             aes(x = UMAP_1, y = UMAP_2), 
             color = "lightgrey", size = 1) +
  geom_point(data = colored_points, 
             aes(x = UMAP_1, y = UMAP_2, 
                 color = log2FC_k4_luminal_vs_basal), 
             size = 1) +
  ggtitle("Genes colored by H3K4me1 signal change in luminal vs basal") +
  xlab("UMAP Dimension 1") +
  ylab("UMAP Dimension 2") +
  labs(color = "log2FC H3K4me1 signal") +
  theme_classic() +
  theme(legend.position = "right") +
  scale_color_gradient2(
    low = mypal[4], 
    mid = "lightgrey", 
    high = mypal[1], 
    midpoint = 0,
    limits = c(-4, 4),  
    oob = scales::squish, 
    breaks = c(-4, -2, 0, 2, 4),  
    labels = c("<-4", "-2", "0", "2", ">4")  
  )

gray_points <- umap_embeddings_k4
colored_points <- umap_embeddings_k4[umap_embeddings_k4$log2FC_k27_luminal_vs_basal != 0, ]

umap_k4_by_k27_fc <- ggplot() +
  geom_point(data = gray_points, 
             aes(x = UMAP_1, y = UMAP_2), 
             color = "lightgrey", size = 1) +
  geom_point(data = colored_points, 
             aes(x = UMAP_1, y = UMAP_2, 
                 color = log2FC_k27_luminal_vs_basal), 
             size = 1) +
  ggtitle("Genes colored by H3K27me3 signal change in luminal vs basal") +
  xlab("UMAP Dimension 1") +
  ylab("UMAP Dimension 2") +
  labs(color = "log2FC H3K27me3 signal") +
  theme_classic() +
  theme(legend.position = "right") +
  scale_color_gradient2(
    low = mypal[4], 
    mid = "lightgrey", 
    high = mypal[1], 
    midpoint = 0,
    limits = c(-4, 4),  
    oob = scales::squish, 
    breaks = c(-4, -2, 0, 2, 4),  
    labels = c("<-4", "-2", "0", "2", ">4")  
  )

### p-values
umap_k4_by_k27_pval <- ggplot(umap_embeddings_k4, aes(x = UMAP_1, y = UMAP_2, color = pval_k27, label = gene)) +
  geom_point(size = 1) +
  theme_minimal() +
  ggtitle("Genes colored by pval of H3K27me3 signal change in luminal vs basal") +
  xlab("UMAP Dimension 1") +
  ylab("UMAP Dimension 2") +
  labs(color = "pval") +
  theme(legend.position = "right") +
  theme_classic() + 
  scale_color_gradient(low = "red", high = "blue")

umap_k4_by_k4_pval <- ggplot(umap_embeddings_k4, aes(x = UMAP_1, y = UMAP_2, color = pval_k4, label = gene)) +
  geom_point(size = 1) +
  theme_minimal() +
  ggtitle("Genes colored by pval of H3K4me1 signal change in luminal vs basal") +
  xlab("UMAP Dimension 1") +
  ylab("UMAP Dimension 2") +
  labs(color = "pval") +
  theme(legend.position = "right") +
  theme_classic() + 
  scale_color_gradient(low = "red", high = "blue")

umap_k4_by_k27_padj <- ggplot(umap_embeddings_k4, aes(x = UMAP_1, y = UMAP_2, color = padj_k27, label = gene)) +
  geom_point(size = 1) +
  theme_minimal() +
  ggtitle("Genes colored by padj of H3K27me3 signal change in luminal vs basal") +
  xlab("UMAP Dimension 1") +
  ylab("UMAP Dimension 2") +
  labs(color = "padj") +
  theme(legend.position = "right") +
  theme_classic() + 
  scale_color_gradient(low = "red", high = "blue")

umap_k4_by_k4_padj <- ggplot(umap_embeddings_k4, aes(x = UMAP_1, y = UMAP_2, color = padj_k4, label = gene)) +
  geom_point(size = 1) +
  theme_minimal() +
  ggtitle("Genes colored by padj of H3K4me1 signal change in luminal vs basal") +
  xlab("UMAP Dimension 1") +
  ylab("UMAP Dimension 2") +
  labs(color = "padj") +
  theme(legend.position = "right") +
  theme_classic() + 
  scale_color_gradient(low = "red", high = "blue")


umap_k4_by_k4_fc

ggsave(file.path(outputDir_plots, "umap_k4_by_k4_fc.pdf"),
       plot = umap_k4_by_k4_fc,
       device = "pdf",
       units = "mm",
       width = 145,
       height = 100)

ggsave(file.path(outputDir_plots, "umap_k4_by_k27_fc.pdf"),
       plot = umap_k4_by_k27_fc,
       device = "pdf",
       units = "mm",
       width = 145,
       height = 100)
7.5.2.4 Entropy
umap_k4_by_k4_entropy <- ggplot(umap_embeddings_k4, aes(x = UMAP_1, y = UMAP_2, color = entropy_k4, label = gene)) +
  geom_point(size = 1) +
  theme_minimal() +
  ggtitle("Genes colored by intercellular entropy of H3K4me1 signal") +
  xlab("UMAP Dimension 1") +
  ylab("UMAP Dimension 2") +
  labs(color = "Shannon's entropy") +
  theme(legend.position = "right")  +
  theme_classic() + 
  scale_color_gradientn(colors = c("#ede6ed", "#e6dae6", "#dbbfdb", "#c491c4", "#7A527A", "#4C334C", "#261b26"))


umap_k4_by_k27_entropy <- ggplot(umap_embeddings_k4, aes(x = UMAP_1, y = UMAP_2, color = entropy_k27, label = gene)) +
  geom_point(size = 1) +
  theme_minimal() +
  ggtitle("Genes colored by intercellular entropy of H3K27me3 signal") +
  xlab("UMAP Dimension 1") +
  ylab("UMAP Dimension 2") +
  labs(color = "Shannon's entropy") +
  theme(legend.position = "right")  +
  theme_classic() + 
  scale_color_gradientn(colors = c("#ede6ed", "#e6dae6", "#dbbfdb", "#c491c4", "#7A527A", "#4C334C", "#261b26"))


umap_k4_by_k4_entropy

umap_k4_by_k27_entropy

ggsave(file.path(outputDir_plots, "umap_k4_by_k4_entropy.pdf"),
       plot = umap_k4_by_k4_entropy,
       device = "pdf",
       units = "mm",
       width = 134,
       height = 100)

ggsave(file.path(outputDir_plots, "umap_k4_by_k27_entropy.pdf"),
       plot = umap_k4_by_k27_entropy,
       device = "pdf",
       units = "mm",
       width = 134,
       height = 100)